|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "!pip install sycamore-ai[elasticsearch]\n", |
| 11 | + "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore" |
| 12 | + ] |
| 13 | + }, |
| 14 | + { |
| 15 | + "cell_type": "code", |
| 16 | + "execution_count": null, |
| 17 | + "id": "60b49e1c-7055-4534-ac09-8b7ab45086d4", |
| 18 | + "metadata": {}, |
| 19 | + "outputs": [], |
| 20 | + "source": [ |
| 21 | + "import os\n", |
| 22 | + "import sycamore\n", |
| 23 | + "from sycamore.context import ExecMode\n", |
| 24 | + "from sycamore.transforms.partition import ArynPartitioner\n", |
| 25 | + "from sycamore.transforms.extract_schema import LLMPropertyExtractor\n", |
| 26 | + "from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n", |
| 27 | + "from sycamore.transforms.standardizer import (\n", |
| 28 | + " USStateStandardizer,\n", |
| 29 | + " DateTimeStandardizer,\n", |
| 30 | + " ignore_errors,\n", |
| 31 | + ")\n", |
| 32 | + "from sycamore.transforms.merge_elements import GreedySectionMerger\n", |
| 33 | + "from sycamore.functions.tokenizer import HuggingFaceTokenizer\n", |
| 34 | + "from sycamore.transforms.embed import SentenceTransformerEmbedder\n", |
| 35 | + "from sycamore.llms import OpenAI, OpenAIModels\n", |
| 36 | + "\n", |
| 37 | + "import pyarrow.fs\n", |
| 38 | + "\n", |
| 39 | + "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", |
| 40 | + "os.environ[\"ARYN_API_KEY\"] = \"<MY-ARYN-API-KEY>\"\n", |
| 41 | + "\n", |
| 42 | + "paths = [\"s3://aryn-public/ntsb/\"]\n", |
| 43 | + "\n", |
| 44 | + "context = sycamore.init()\n", |
| 45 | + "# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n", |
| 46 | + "docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n", |
| 47 | + "docset = docset.materialize(\n", |
| 48 | + " path=\"./elasticsearch-tutorial/downloaded-docset\",\n", |
| 49 | + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", |
| 50 | + ")\n", |
| 51 | + "# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n", |
| 52 | + "partitioned_docset = docset.partition(\n", |
| 53 | + " partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n", |
| 54 | + ").materialize(\n", |
| 55 | + " path=\"./elasticsearch-tutorial/partitioned-docset\",\n", |
| 56 | + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", |
| 57 | + ")\n", |
| 58 | + "partitioned_docset.execute()" |
| 59 | + ] |
| 60 | + }, |
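| 61 | + { |
| 62 | + "cell_type": "markdown", |
| 63 | + "id": "partition-sanity-check-md", |
| 64 | + "metadata": {}, |
| 65 | + "source": [ |
| 66 | + "Optional sanity check (an addition, not part of the original flow): peek at the partitioned output before enriching it. `show` is the same `DocSet` helper used at the end of this notebook; treat the `limit` argument as an assumption about its signature." |
| 67 | + ] |
| 68 | + }, |
| 69 | + { |
| 70 | + "cell_type": "code", |
| 71 | + "execution_count": null, |
| 72 | + "id": "partition-sanity-check-code", |
| 73 | + "metadata": {}, |
| 74 | + "outputs": [], |
| 75 | + "source": [ |
| 76 | + "# Hedged sketch: print a couple of partitioned documents and their elements.\n", |
| 77 | + "# `limit` is assumed to be a supported argument of DocSet.show().\n", |
| 78 | + "partitioned_docset.show(limit=2)" |
| 79 | + ] |
| 80 | + }, |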
| 61 | + { |
| 62 | + "cell_type": "code", |
| 63 | + "execution_count": null, |
| 64 | + "id": "a755a09e-1622-400b-8b75-b3bad2981b5f", |
| 65 | + "metadata": {}, |
| 66 | + "outputs": [], |
| 67 | + "source": [ |
| 68 | + "schema = {\n", |
| 69 | + " \"type\": \"object\",\n", |
| 70 | + " \"properties\": {\n", |
| 71 | + " \"accidentNumber\": {\"type\": \"string\"},\n", |
| 72 | + " \"dateAndTime\": {\"type\": \"date\"},\n", |
| 73 | + " \"location\": {\n", |
| 74 | + " \"type\": \"string\",\n", |
| 75 | + " \"description\": \"US State where the incident occured\",\n", |
| 76 | + " },\n", |
| 77 | + " \"aircraft\": {\"type\": \"string\"},\n", |
| 78 | + " \"aircraftDamage\": {\"type\": \"string\"},\n", |
| 79 | + " \"injuries\": {\"type\": \"string\"},\n", |
| 80 | + " \"definingEvent\": {\"type\": \"string\"},\n", |
| 81 | + " },\n", |
| 82 | + " \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n", |
| 83 | + "}\n", |
| 84 | + "\n", |
| 85 | + "schema_name = \"FlightAccidentReport\"\n", |
| 86 | + "property_extractor = LLMPropertyExtractor(\n", |
| 87 | + " llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n", |
| 88 | + ")\n", |
| 89 | + "\n", |
| 90 | + "enriched_docset = (\n", |
| 91 | + " partitioned_docset\n", |
| 92 | + " # Extracts the properties based on the schema defined\n", |
| 93 | + " .extract_properties(property_extractor=property_extractor)\n", |
| 94 | + " # Summarizes images that were extracted using an LLM\n", |
| 95 | + " .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n", |
| 96 | + ")\n", |
| 97 | + "\n", |
| 98 | + "formatted_docset = (\n", |
| 99 | + " enriched_docset\n", |
| 100 | + " # Converts state abbreviations to their full names.\n", |
| 101 | + " .map(\n", |
| 102 | + " lambda doc: ignore_errors(\n", |
| 103 | + " doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n", |
| 104 | + " )\n", |
| 105 | + " )\n", |
| 106 | + " # Converts datetime into a common format\n", |
| 107 | + " .map(\n", |
| 108 | + " lambda doc: ignore_errors(\n", |
| 109 | + " doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n", |
| 110 | + " )\n", |
| 111 | + " )\n", |
| 112 | + ")\n", |
| 113 | + "\n", |
| 114 | + "\n", |
| 115 | + "merger = GreedySectionMerger(\n", |
| 116 | + " tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n", |
| 117 | + " max_tokens=512,\n", |
| 118 | + ")\n", |
| 119 | + "chunked_docset = formatted_docset.merge(merger=merger)\n", |
| 120 | + "\n", |
| 121 | + "model_name = \"thenlper/gte-small\"\n", |
| 122 | + "\n", |
| 123 | + "embedded_docset = (\n", |
| 124 | + " chunked_docset.spread_properties([\"entity\", \"path\"])\n", |
| 125 | + " .explode()\n", |
| 126 | + " .embed(\n", |
| 127 | + " embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n", |
| 128 | + " )\n", |
| 129 | + ")\n", |
| 130 | + "\n", |
| 131 | + "embedded_docset = embedded_docset.materialize(\n", |
| 132 | + " path=\"./elasticsearch-tutorial/embedded-docset\",\n", |
| 133 | + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", |
| 134 | + ")\n", |
| 135 | + "embedded_docset.execute()" |
| 136 | + ] |
| 137 | + }, |
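| 138 | + { |
| 139 | + "cell_type": "markdown", |
| 140 | + "id": "inspect-extracted-properties-md", |
| 141 | + "metadata": {}, |
| 142 | + "source": [ |
| 143 | + "A hedged peek at the extracted metadata (an addition, not part of the original flow). The `properties[\"entity\"]` location is inferred from the standardizer paths used above, and `take` is assumed to return a list of `Document` objects." |
| 144 | + ] |
| 145 | + }, |
| 146 | + { |
| 147 | + "cell_type": "code", |
| 148 | + "execution_count": null, |
| 149 | + "id": "inspect-extracted-properties-code", |
| 150 | + "metadata": {}, |
| 151 | + "outputs": [], |
| 152 | + "source": [ |
| 153 | + "# Hedged sketch: print the LLM-extracted properties for a few chunks.\n", |
| 154 | + "# Assumption: LLMPropertyExtractor stores its output under properties[\"entity\"],\n", |
| 155 | + "# consistent with the standardizer paths in the previous cell.\n", |
| 156 | + "for doc in embedded_docset.take(3):\n", |
| 157 | + "    print(doc.properties.get(\"path\"), doc.properties.get(\"entity\"))" |
| 158 | + ] |
| 159 | + }, |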
| 138 | + { |
| 139 | + "cell_type": "code", |
| 140 | + "execution_count": null, |
| 141 | + "id": "b9321d7e-e812-41ac-8030-3db80c2147ec", |
| 142 | + "metadata": {}, |
| 143 | + "outputs": [], |
| 144 | + "source": [ |
| 145 | + "# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work.\n", |
| 146 | + "# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n", |
| 147 | + "\n", |
| 148 | + "url = \"http://localhost:9200\"\n", |
| 149 | + "index_name = \"aryn-demo\"\n", |
| 150 | + "embedded_ds.write.elasticsearch(\n", |
| 151 | + " url=url,\n", |
| 152 | + " index_name=index_name,\n", |
| 153 | + " es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n", |
| 154 | + " mappings={\n", |
| 155 | + " \"properties\": {\n", |
| 156 | + " \"embeddings\": {\n", |
| 157 | + " \"type\": \"dense_vector\",\n", |
| 158 | + " \"dims\": dimensions,\n", |
| 159 | + " \"index\": True,\n", |
| 160 | + " \"similarity\": \"cosine\",\n", |
| 161 | + " },\n", |
| 162 | + " \"properties\": {\"type\": \"object\"},\n", |
| 163 | + " }\n", |
| 164 | + " },\n", |
| 165 | + ")" |
| 166 | + ] |
| 167 | + }, |
| 168 | + { |
| 169 | + "cell_type": "code", |
| 170 | + "execution_count": null, |
| 171 | + "id": "52970be4-7bac-455b-bcd0-868130ac61fd", |
| 172 | + "metadata": {}, |
| 173 | + "outputs": [], |
| 174 | + "source": [ |
| 175 | + "# Verify data has been loaded using DocSet Query to retrieve chunks\n", |
| 176 | + "query_params = {\"match_all\": {}}\n", |
| 177 | + "query_docs = ctx.read.elasticsearch(\n", |
| 178 | + " url=url,\n", |
| 179 | + " index_name=index_name,\n", |
| 180 | + " query=query_params,\n", |
| 181 | + " es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n", |
| 182 | + ")\n", |
| 183 | + "query_docs.show(show_embedding=False)" |
| 184 | + ] |
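| 185 | + }, |
| 186 | + { |
| 187 | + "cell_type": "markdown", |
| 188 | + "id": "knn-search-example-md", |
| 189 | + "metadata": {}, |
| 190 | + "source": [ |
| 191 | + "A hedged end-to-end check (an addition, not part of the original tutorial): embed a question with the same model used for indexing, then run an Elasticsearch kNN query directly against the `embeddings` field defined in the mapping above. The sample question and the `k`/`num_candidates` values are illustrative, and it is an assumption that the writer stores metadata under a top-level `properties` field." |
| 192 | + ] |
| 193 | + }, |
| 194 | + { |
| 195 | + "cell_type": "code", |
| 196 | + "execution_count": null, |
| 197 | + "id": "knn-search-example-code", |
| 198 | + "metadata": {}, |
| 199 | + "outputs": [], |
| 200 | + "source": [ |
| 201 | + "# Hedged sketch: semantic search over the freshly written index.\n", |
| 202 | + "from elasticsearch import Elasticsearch\n", |
| 203 | + "from sentence_transformers import SentenceTransformer\n", |
| 204 | + "\n", |
| 205 | + "client = Elasticsearch(url, basic_auth=(\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\")))\n", |
| 206 | + "# Encode the question with the same embedding model used for the documents.\n", |
| 207 | + "question_vector = SentenceTransformer(model_name).encode(\"aircraft damaged by fire\").tolist()\n", |
| 208 | + "\n", |
| 209 | + "resp = client.search(\n", |
| 210 | + "    index=index_name,\n", |
| 211 | + "    knn={\n", |
| 212 | + "        \"field\": \"embeddings\",\n", |
| 213 | + "        \"query_vector\": question_vector,\n", |
| 214 | + "        \"k\": 5,\n", |
| 215 | + "        \"num_candidates\": 50,\n", |
| 216 | + "    },\n", |
| 217 | + "    source=[\"properties\"],\n", |
| 218 | + ")\n", |
| 219 | + "for hit in resp[\"hits\"][\"hits\"]:\n", |
| 220 | + "    print(hit[\"_score\"], hit[\"_source\"].get(\"properties\", {}).get(\"entity\"))" |
| 221 | + ] |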
| 185 | + } |
| 186 | + ], |
| 187 | + "metadata": { |
| 188 | + "kernelspec": { |
| 189 | + "display_name": "Python 3 (ipykernel)", |
| 190 | + "language": "python", |
| 191 | + "name": "python3" |
| 192 | + }, |
| 193 | + "language_info": { |
| 194 | + "codemirror_mode": { |
| 195 | + "name": "ipython", |
| 196 | + "version": 3 |
| 197 | + }, |
| 198 | + "file_extension": ".py", |
| 199 | + "mimetype": "text/x-python", |
| 200 | + "name": "python", |
| 201 | + "nbconvert_exporter": "python", |
| 202 | + "pygments_lexer": "ipython3", |
| 203 | + "version": "3.11.6" |
| 204 | + } |
| 205 | + }, |
| 206 | + "nbformat": 4, |
| 207 | + "nbformat_minor": 5 |
| 208 | +} |