Skip to content

can't run the notebook locally  #63

@sylvain471

Description

@sylvain471

Hello, I am very interested in this work and am trying to run it locally.

However, I am stuck at the following cell:

# Extract sections
sections_ds = ds.flat_map(extract_sections)
sections_ds.count() 

`sections_ds.count()` throws the following error. Any idea what might solve this issue?

{
	"name": "RayTaskError(FileNotFoundError)",
	"message": "ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
    for data in iter:
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
    for out_row in fn(row):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'",
	"stack": "---------------------------------------------------------------------------
ObjectRefStreamEndOfStreamError           Traceback (most recent call last)
File python/ray/_raylet.pyx:345, in ray._raylet.StreamingObjectRefGenerator._next_sync()

File python/ray/_raylet.pyx:4533, in ray._raylet.CoreWorker.try_read_next_object_ref_stream()

File python/ray/_raylet.pyx:443, in ray._raylet.check_status()

ObjectRefStreamEndOfStreamError: 

During handling of the above exception, another exception occurred:

StopIteration                             Traceback (most recent call last)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:80, in DataOpTask.on_waitable_ready(self)
     79 try:
---> 80     meta = ray.get(next(self._streaming_gen))
     81 except StopIteration:
     82     # The generator should always yield 2 values (block and metadata)
     83     # each time. If we get a StopIteration here, it means an error
   (...)
     86     # TODO(hchen): Ray Core should have a better interface for
     87     # detecting and obtaining the exception.

File python/ray/_raylet.pyx:300, in ray._raylet.StreamingObjectRefGenerator.__next__()

File python/ray/_raylet.pyx:351, in ray._raylet.StreamingObjectRefGenerator._next_sync()

StopIteration: 

During handling of the above exception, another exception occurred:

RayTaskError(FileNotFoundError)           Traceback (most recent call last)
/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb Cell 20 line 4
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a> # Extract sections
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=1'>2</a> #ray.data.DataContext.get_current().execution_options.verbose_progress = True
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=2'>3</a> sections_ds = ds.flat_map(extract_sections)
----> <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=3'>4</a> sections_ds.count()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:2498, in Dataset.count(self)
   2492     return meta_count
   2494 get_num_rows = cached_remote_fn(_get_num_rows)
   2496 return sum(
   2497     ray.get(
-> 2498         [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
   2499     )
   2500 )

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:4799, in Dataset.get_internal_block_refs(self)
   4780 @ConsumptionAPI(pattern=\"Time complexity:\")
   4781 @DeveloperAPI
   4782 def get_internal_block_refs(self) -> List[ObjectRef[Block]]:
   4783     \"\"\"Get a list of references to the underlying blocks of this dataset.
   4784 
   4785     This function can be used for zero-copy access to the data. It blocks
   (...)
   4797         A list of references to this dataset's blocks.
   4798     \"\"\"
-> 4799     blocks = self._plan.execute().get_blocks()
   4800     self._synchronize_progress_bar()
   4801     return blocks

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/plan.py:591, in ExecutionPlan.execute(self, allow_clear_input_blocks, force_read, preserve_order)
    589 else:
    590     executor = BulkExecutor(copy.deepcopy(context.execution_options))
--> 591 blocks = execute_to_legacy_block_list(
    592     executor,
    593     self,
    594     allow_clear_input_blocks=allow_clear_input_blocks,
    595     dataset_uuid=self._dataset_uuid,
    596     preserve_order=preserve_order,
    597 )
    598 # TODO(ekl) we shouldn't need to set this in the future once we move
    599 # to a fully lazy execution model, unless .materialize() is used. Th
    600 # reason we need it right now is since the user may iterate over a
    601 # Dataset multiple times after fully executing it once.
    602 if not self._run_by_consumer:

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:119, in execute_to_legacy_block_list(executor, plan, allow_clear_input_blocks, dataset_uuid, preserve_order)
    112 dag, stats = _get_execution_dag(
    113     executor,
    114     plan,
    115     allow_clear_input_blocks,
    116     preserve_order,
    117 )
    118 bundles = executor.execute(dag, initial_stats=stats)
--> 119 block_list = _bundles_to_block_list(bundles)
    120 # Set the stats UUID after execution finishes.
    121 _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:357, in _bundles_to_block_list(bundles)
    355 blocks, metadata = [], []
    356 owns_blocks = True
--> 357 for ref_bundle in bundles:
    358     if not ref_bundle.owns_blocks:
    359         owns_blocks = False

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/executor.py:37, in OutputIterator.__next__(self)
     36 def __next__(self) -> RefBundle:
---> 37     return self.get_next()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:129, in StreamingExecutor.execute.<locals>.StreamIterator.get_next(self, output_split_idx)
    127         raise StopIteration
    128 elif isinstance(item, Exception):
--> 129     raise item
    130 else:
    131     # Otherwise return a concrete RefBundle.
    132     if self._outer._global_info:

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:187, in StreamingExecutor.run(self)
    181 \"\"\"Run the control loop in a helper thread.
    182 
    183 Results are returned via the output node's outqueue.
    184 \"\"\"
    185 try:
    186     # Run scheduling loop until complete.
--> 187     while self._scheduling_loop_step(self._topology) and not self._shutdown:
    188         pass
    189 except Exception as e:
    190     # Propagate it to the result iterator.

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:235, in StreamingExecutor._scheduling_loop_step(self, topology)
    230     logger.get_logger().info(\"Scheduling loop step...\")
    232 # Note: calling process_completed_tasks() is expensive since it incurs
    233 # ray.wait() overhead, so make sure to allow multiple dispatch per call for
    234 # greater parallelism.
--> 235 process_completed_tasks(topology)
    237 # Dispatch as many operators as we can for completed tasks.
    238 limits = self._get_or_refresh_resource_limits()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py:333, in process_completed_tasks(topology)
    326     ready, _ = ray.wait(
    327         list(active_tasks.keys()),
    328         num_returns=len(active_tasks),
    329         fetch_local=False,
    330         timeout=0.1,
    331     )
    332     for ref in ready:
--> 333         active_tasks[ref].on_waitable_ready()
    335 # Pull any operator outputs into the streaming op state.
    336 for op, op_state in topology.items():

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:88, in DataOpTask.on_waitable_ready(self)
     80     meta = ray.get(next(self._streaming_gen))
     81 except StopIteration:
     82     # The generator should always yield 2 values (block and metadata)
     83     # each time. If we get a StopIteration here, it means an error
   (...)
     86     # TODO(hchen): Ray Core should have a better interface for
     87     # detecting and obtaining the exception.
---> 88     ex = ray.get(block_ref)
     89     self._task_done_callback()
     90     raise ex

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
     21 @wraps(fn)
     22 def auto_init_wrapper(*args, **kwargs):
     23     auto_init_ray()
---> 24     return fn(*args, **kwargs)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
    101     if func.__name__ != \"init\" or is_client_mode_enabled_by_default:
    102         return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/worker.py:2547, in get(object_refs, timeout)
   2545     worker.core_worker.dump_object_store_memory_usage()
   2546 if isinstance(value, RayTaskError):
-> 2547     raise value.as_instanceof_cause()
   2548 else:
   2549     raise value

RayTaskError(FileNotFoundError): ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
    for data in iter:
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
    for out_row in fn(row):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'"
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions