-
Notifications
You must be signed in to change notification settings - Fork 152
Description
batch data
I have crashed many times at `batch.to(device)` — possibly because of FSDP?
DataProto(batch=TensorDict(
�[36m(TaskRunner pid=73703)�[0m fields={
�[36m(TaskRunner pid=73703)�[0m attention_mask: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m input_ids: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m is_drop_mask: Tensor(shape=torch.Size([56]), device=cpu, dtype=torch.bool, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m position_ids: Tensor(shape=torch.Size([56, 16384]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m prompts: Tensor(shape=torch.Size([56, 15360]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m response_mask: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m responses: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.int64, is_shared=False),
�[36m(TaskRunner pid=73703)�[0m token_level_scores: Tensor(shape=torch.Size([56, 1024]), device=cpu, dtype=torch.bfloat16, is_shared=False)},
�[36m(TaskRunner pid=73703)�[0m batch_size=torch.Size([56]),
�[36m(TaskRunner pid=73703)�[0m device=None,
Error MSG:
...
Exception in thread Thread-3 (_loop_forever):
�[36m(WorkerDict pid=74817)�[0m Traceback (most recent call last):
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
�[36m(WorkerDict pid=74817)�[0m self.run()
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/threading.py", line 1012, in run
�[36m(WorkerDict pid=74817)�[0m self._target(*self._args, **self._kwargs)
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 453, in _loop_forever
�[36m(WorkerDict pid=74817)�[0m result = self.execute_method(method, *args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 501, in execute_method
�[36m(WorkerDict pid=74817)�[0m return self.inference_engine.execute_method(method, *args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 628, in execute_method
�[36m(WorkerDict pid=74817)�[0m raise e
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 619, in execute_method
�[36m(WorkerDict pid=74817)�[0m return run_method(self, method, args, kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/utils/init.py", line 3060, in run_method
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 436, in execute_model
�[36m(WorkerDict pid=74817)�[0m output = self.model_runner.execute_model(scheduler_output,
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2129, in execute_model
�[36m(WorkerDict pid=74817)�[0m ) = self._bookkeeping_sync(scheduler_output, sampler_output,
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1929, in _bookkeeping_sync
�[36m(WorkerDict pid=74817)�[0m valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 3742, in _to_list
�[36m(WorkerDict pid=74817)�[0m self.transfer_event.synchronize()
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/cuda/streams.py", line 231, in synchronize
�[36m(WorkerDict pid=74817)�[0m super().synchronize()
�[36m(WorkerDict pid=74817)�[0m torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Exception in thread Thread-3 (_loop_forever):
�[36m(WorkerDict pid=74817)�[0m Traceback (most recent call last):
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
�[36m(WorkerDict pid=74817)�[0m self.run()
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/threading.py", line 1012, in run
�[36m(WorkerDict pid=74817)�[0m self._target(*self._args, **self._kwargs)
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 453, in _loop_forever
�[36m(WorkerDict pid=74817)�[0m result = self.execute_method(method, *args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 501, in execute_method
�[36m(WorkerDict pid=74817)�[0m return self.inference_engine.execute_method(method, *args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 628, in execute_method
�[36m(WorkerDict pid=74817)�[0m raise e
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 619, in execute_method
�[36m(WorkerDict pid=74817)�[0m return run_method(self, method, args, kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/utils/init.py", line 3060, in run_method
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 436, in execute_model
�[36m(WorkerDict pid=74817)�[0m output = self.model_runner.execute_model(scheduler_output,
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
�[36m(WorkerDict pid=74817)�[0m return func(*args, **kwargs)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2129, in execute_model
�[36m(WorkerDict pid=74817)�[0m ) = self._bookkeeping_sync(scheduler_output, sampler_output,
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1929, in _bookkeeping_sync
�[36m(WorkerDict pid=74817)�[0m valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[36m(WorkerDict pid=74817)�[0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 3742, in _to_list
�[36m(WorkerDict pid=74817)�[0m self.transfer_event.synchronize()
�[36m(WorkerDict pid=74817)�[0m File "/opt/conda/lib/python3.12/site-packages/torch/cuda/streams.py", line 231, in synchronize
�[36m(WorkerDict pid=74817)�[0m super().synchronize()
�[36m(WorkerDict pid=74817)�[0m torch.AcceleratorError: CUDA error: an illegal memory access was encountered