
I think compute_response_mask should not be put in trainer.py, otherwise response_mask may not be contiguous. Should it be moved to deamon.py instead? #119

Description

@af-74413592

Batch data (logged by TaskRunner pid=73703). The run crashed many times at batch.to(device); could this be caused by FSDP? A contiguity-check sketch follows the dump.
DataProto(batch=TensorDict(
    fields={
        attention_mask: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
        input_ids: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
        is_drop_mask: Tensor(shape=torch.Size([56]), device=cpu, dtype=torch.bool, is_shared=False),
        position_ids: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
        prompts: Tensor(shape=torch.Size([56, 15360]), device=cpu, dtype=torch.int64, is_shared=False),
        response_mask: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.int64, is_shared=False),
        responses: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.int64, is_shared=False),
        token_level_scores: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.bfloat16, is_shared=False)},
    batch_size=torch.Size([56]),
    device=None,
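
One quick way to narrow this down is to check whether any field in the batch is a non-contiguous view right before the failing batch.to(device) call. The helper below is only a minimal sketch, not part of verl; it assumes the batch is the DataProto shown above, with the TensorDict reachable as batch.batch.

import torch
from tensordict import TensorDict  # same structure as the dump above

def report_non_contiguous(td: TensorDict) -> None:
    # Print every tensor field whose storage is not contiguous.
    for key, tensor in td.items():
        if isinstance(tensor, torch.Tensor) and not tensor.is_contiguous():
            print(f"{key}: shape={tuple(tensor.shape)} is NOT contiguous")

# Hypothetical usage, right before the call that crashes:
# report_non_contiguous(batch.batch)  # assumes DataProto keeps the TensorDict in .batch
# batch = batch.to(torch.device("cuda"))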
Error message (from WorkerDict pid=74817):
...
Exception in thread Thread-3 (_loop_forever):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 453, in _loop_forever
    result = self.execute_method(method, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 501, in execute_method
    return self.inference_engine.execute_method(method, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 628, in execute_method
    raise e
  File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 619, in execute_method
    return run_method(self, method, args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3060, in run_method
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 436, in execute_model
    output = self.model_runner.execute_model(scheduler_output,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2129, in execute_model
    ) = self._bookkeeping_sync(scheduler_output, sampler_output,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1929, in _bookkeeping_sync
    valid_sampled_token_ids = self._to_list(sampled_token_ids)
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 3742, in _to_list
    self.transfer_event.synchronize()
  File "/opt/conda/lib/python3.12/site-packages/torch/cuda/streams.py", line 231, in synchronize
    super().synchronize()
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
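
If the root cause really is a non-contiguous response_mask produced in trainer.py, forcing contiguity where the mask is built may be a usable workaround regardless of whether compute_response_mask moves to deamon.py. The snippet below is only a sketch of that idea, not verl's actual implementation; the slicing logic and the response length of 1024 are assumptions taken from the shapes in the dump above.

import torch

def compute_response_mask_contiguous(attention_mask: torch.Tensor,
                                     response_length: int) -> torch.Tensor:
    # The last `response_length` columns of attention_mask form a view that
    # shares storage with the full mask and is therefore not contiguous.
    response_mask = attention_mask[:, -response_length:]
    # .contiguous() copies the view into its own buffer, so a later
    # batch.to(device) sees well-formed storage.
    return response_mask.contiguous()

# Hypothetical usage with the shapes from the dump (56 x 16384 attention_mask,
# 1024-token responses):
# batch.batch["response_mask"] = compute_response_mask_contiguous(
#     batch.batch["attention_mask"], response_length=1024)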
