Skip to content

Commit e3432ce

Browse files
jonfritz and HenryL27 authored
Add notebook for Aryn blog post (#380)
* Create test * Add files via upload * Rename aryn-elasticsearch-blog-demo-clean copy.ipynb to aryn-elasticsearch-blog-dataprep.ipynb * Create README.md * Delete notebooks/integrations/aryn/test * Add files via upload Update with pip install * Update aryn-elasticsearch-blog-dataprep.ipynb Update from feedback * Update aryn-elasticsearch-blog-dataprep.ipynb Add placeholder API key * Update aryn-elasticsearch-blog-dataprep.ipynb Add comma * Update aryn-elasticsearch-blog-dataprep.ipynb Formatting * Update aryn-elasticsearch-blog-dataprep.ipynb Formatting * Update aryn-elasticsearch-blog-dataprep.ipynb Add API key placeholder * Update aryn-elasticsearch-blog-dataprep.ipynb Formatting * fix formatting Signed-off-by: Henry Lindeman <[email protected]> * Create aryn-elasticsearch-RAG-data-preparation-demo * Rename supporting-blog-content/aryn-elasticsearch-RAG-data-preparation-demo to supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/test.md * Add files via upload * Delete supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/test.md * Delete notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb * Update README.md Add link --------- Signed-off-by: Henry Lindeman <[email protected]> Co-authored-by: Henry Lindeman <[email protected]>
1 parent a59ab5a commit e3432ce

File tree

2 files changed

+211
-0
lines changed

2 files changed

+211
-0
lines changed

notebooks/integrations/aryn/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This folder contains examples showing how to prepare data using Aryn Sycamore and load it into Elasticsearch for RAG and GenAI use cases.
2+
3+
The notebook for the Aryn Elasticsearch blog example is [here](https://github.com/elastic/elasticsearch-labs/blob/main/supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/aryn-elasticsearch-blog-dataprep.ipynb).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"!pip install sycamore-ai[elasticsearch]\n",
11+
"# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"id": "60b49e1c-7055-4534-ac09-8b7ab45086d4",
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"import os\n",
22+
"import sycamore\n",
23+
"from sycamore.context import ExecMode\n",
24+
"from sycamore.transforms.partition import ArynPartitioner\n",
25+
"from sycamore.transforms.extract_schema import LLMPropertyExtractor\n",
26+
"from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n",
27+
"from sycamore.transforms.standardizer import (\n",
28+
" USStateStandardizer,\n",
29+
" DateTimeStandardizer,\n",
30+
" ignore_errors,\n",
31+
")\n",
32+
"from sycamore.transforms.merge_elements import GreedySectionMerger\n",
33+
"from sycamore.functions.tokenizer import HuggingFaceTokenizer\n",
34+
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
35+
"from sycamore.llms import OpenAI, OpenAIModels\n",
36+
"\n",
37+
"import pyarrow.fs\n",
38+
"\n",
39+
"llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n",
40+
"os.environ[\"ARYN_API_KEY\"] = \"<MY-ARYN-API-KEY>\"\n",
41+
"\n",
42+
"paths = [\"s3://aryn-public/ntsb/\"]\n",
43+
"\n",
44+
"context = sycamore.init()\n",
45+
"# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n",
46+
"docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n",
47+
"docset = docset.materialize(\n",
48+
" path=\"./elasticsearch-tutorial/downloaded-docset\",\n",
49+
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
50+
")\n",
51+
"# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n",
52+
"partitioned_docset = docset.partition(\n",
53+
" partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n",
54+
").materialize(\n",
55+
" path=\"./elasticsearch-tutorial/partitioned-docset\",\n",
56+
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
57+
")\n",
58+
"partitioned_docset.execute()"
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": null,
64+
"id": "a755a09e-1622-400b-8b75-b3bad2981b5f",
65+
"metadata": {},
66+
"outputs": [],
67+
"source": [
68+
"schema = {\n",
69+
" \"type\": \"object\",\n",
70+
" \"properties\": {\n",
71+
" \"accidentNumber\": {\"type\": \"string\"},\n",
72+
" \"dateAndTime\": {\"type\": \"date\"},\n",
73+
" \"location\": {\n",
74+
" \"type\": \"string\",\n",
75+
"            \"description\": \"US State where the incident occurred\",\n",
76+
" },\n",
77+
" \"aircraft\": {\"type\": \"string\"},\n",
78+
" \"aircraftDamage\": {\"type\": \"string\"},\n",
79+
" \"injuries\": {\"type\": \"string\"},\n",
80+
" \"definingEvent\": {\"type\": \"string\"},\n",
81+
" },\n",
82+
" \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n",
83+
"}\n",
84+
"\n",
85+
"schema_name = \"FlightAccidentReport\"\n",
86+
"property_extractor = LLMPropertyExtractor(\n",
87+
" llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n",
88+
")\n",
89+
"\n",
90+
"enriched_docset = (\n",
91+
" partitioned_docset\n",
92+
" # Extracts the properties based on the schema defined\n",
93+
" .extract_properties(property_extractor=property_extractor)\n",
94+
" # Summarizes images that were extracted using an LLM\n",
95+
" .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n",
96+
")\n",
97+
"\n",
98+
"formatted_docset = (\n",
99+
" enriched_docset\n",
100+
" # Converts state abbreviations to their full names.\n",
101+
" .map(\n",
102+
" lambda doc: ignore_errors(\n",
103+
" doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n",
104+
" )\n",
105+
" )\n",
106+
" # Converts datetime into a common format\n",
107+
" .map(\n",
108+
" lambda doc: ignore_errors(\n",
109+
" doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n",
110+
" )\n",
111+
" )\n",
112+
")\n",
113+
"\n",
114+
"\n",
115+
"merger = GreedySectionMerger(\n",
116+
" tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
117+
" max_tokens=512,\n",
118+
")\n",
119+
"chunked_docset = formatted_docset.merge(merger=merger)\n",
120+
"\n",
121+
"model_name = \"thenlper/gte-small\"\n",
122+
"\n",
123+
"embedded_docset = (\n",
124+
" chunked_docset.spread_properties([\"entity\", \"path\"])\n",
125+
" .explode()\n",
126+
" .embed(\n",
127+
" embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n",
128+
" )\n",
129+
")\n",
130+
"\n",
131+
"embedded_docset = embedded_docset.materialize(\n",
132+
" path=\"./elasticsearch-tutorial/embedded-docset\",\n",
133+
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
134+
")\n",
135+
"embedded_docset.execute()"
136+
]
137+
},
138+
{
139+
"cell_type": "code",
140+
"execution_count": null,
141+
"id": "b9321d7e-e812-41ac-8030-3db80c2147ec",
142+
"metadata": {},
143+
"outputs": [],
144+
"source": [
145+
"# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work.\n",
146+
"# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n",
147+
"\n",
148+
"url = \"http://localhost:9200\"\n",
149+
"index_name = \"aryn-demo\"\n",
150+
"embedded_docset.write.elasticsearch(\n",
151+
" url=url,\n",
152+
" index_name=index_name,\n",
153+
" es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
154+
" mappings={\n",
155+
" \"properties\": {\n",
156+
" \"embeddings\": {\n",
157+
" \"type\": \"dense_vector\",\n",
158+
"            \"dims\": 384,  # embedding size of thenlper/gte-small\n",
159+
" \"index\": True,\n",
160+
" \"similarity\": \"cosine\",\n",
161+
" },\n",
162+
" \"properties\": {\"type\": \"object\"},\n",
163+
" }\n",
164+
" },\n",
165+
")"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": null,
171+
"id": "52970be4-7bac-455b-bcd0-868130ac61fd",
172+
"metadata": {},
173+
"outputs": [],
174+
"source": [
175+
"# Verify data has been loaded using DocSet Query to retrieve chunks\n",
176+
"query_params = {\"match_all\": {}}\n",
177+
"query_docs = context.read.elasticsearch(\n",
178+
" url=url,\n",
179+
" index_name=index_name,\n",
180+
" query=query_params,\n",
181+
" es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
182+
")\n",
183+
"query_docs.show(show_embedding=False)"
184+
]
185+
}
186+
],
187+
"metadata": {
188+
"kernelspec": {
189+
"display_name": "Python 3 (ipykernel)",
190+
"language": "python",
191+
"name": "python3"
192+
},
193+
"language_info": {
194+
"codemirror_mode": {
195+
"name": "ipython",
196+
"version": 3
197+
},
198+
"file_extension": ".py",
199+
"mimetype": "text/x-python",
200+
"name": "python",
201+
"nbconvert_exporter": "python",
202+
"pygments_lexer": "ipython3",
203+
"version": "3.11.6"
204+
}
205+
},
206+
"nbformat": 4,
207+
"nbformat_minor": 5
208+
}

0 commit comments

Comments
 (0)