Skip to content

Commit dba4058

Browse files
committed
full rework
1 parent f3da73b commit dba4058

File tree

5 files changed

+251
-36
lines changed

5 files changed

+251
-36
lines changed

AstraDB_langchain_quickstart_1.ipynb

Lines changed: 60 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@
122122
"id": "5a9fef37-ec79-4aa3-8817-5722909effc3",
123123
"metadata": {},
124124
"source": [
125-
"### additional step for Azure OpenAI\n",
125+
"##### Additional step for Azure OpenAI\n",
126126
"\n",
127-
"If you use Microsoft Azure OpenAI, uncomment the following cell and set additional environment variables, editing as needed:\n",
127+
"If you use Microsoft Azure OpenAI, uncomment the following cell and edit as needed to set additional environment variables:\n",
128128
"\n",
129129
"_(remember the `OPENAI_API_KEY` provided earlier must be appropriate to Azure.)_"
130130
]
@@ -161,7 +161,7 @@
161161
"ASTRA_DB_KEYSPACE = os.environ.get(\"ASTRA_DB_KEYSPACE\") or None\n",
162162
"ASTRA_DB_API_KEY_NAME = os.environ.get(\"ASTRA_DB_API_KEY_NAME\") or None\n",
163163
"\n",
164-
"OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]"
164+
"OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\") or None"
165165
]
166166
},
167167
{
@@ -184,6 +184,7 @@
184184
"outputs": [],
185185
"source": [
186186
"# Edit if necessary, then run the cell\n",
187+
"\n",
187188
"USE_VECTORIZE = True # server-side embeddings\n",
188189
"# USE_VECTORIZE = False # explicit embeddings"
189190
]
@@ -229,12 +230,13 @@
229230
" namespace=ASTRA_DB_KEYSPACE,\n",
230231
" )\n",
231232
"\n",
233+
"\n",
232234
"## If you already have a populated vector collection, try this instead\n",
233-
"## (and skip the 'load dataset' phase if you are so inclined):\n",
235+
"## (and then skip the load+process+insert phases if you are so inclined):\n",
234236
"\n",
235-
"# vector_store = = AstraDBVectorStore(\n",
237+
"# vector_store = AstraDBVectorStore(\n",
236238
"# collection_name=\"INSERT_YOUR_COLLECTION_NAME\",\n",
237-
"# embedding=embedding, # omit for vectorize; otherwise, must match the data on DB\n",
239+
"# embedding=EMBEDDING, # omit for vectorize; else, must be the same used for the data on DB\n",
238240
"# token=ASTRA_DB_APPLICATION_TOKEN,\n",
239241
"# api_endpoint=ASTRA_DB_API_ENDPOINT,\n",
240242
"# namespace=ASTRA_DB_KEYSPACE,\n",
@@ -249,7 +251,7 @@
249251
"source": [
250252
"### Load data\n",
251253
"\n",
252-
"Load a small dataset of phlosophical quotes using the Python `dataset` package."
254+
"Load a small dataset of philosophical quotes using the Python `datasets` package."
253255
]
254256
},
255257
{
@@ -284,17 +286,22 @@
284286
"source": [
285287
"documents_to_insert = []\n",
286288
"\n",
287-
"for entry in philo_dataset:\n",
289+
"for entry_idx, entry in enumerate(philo_dataset):\n",
288290
" metadata = {\"author\": entry[\"author\"]}\n",
289291
" if entry[\"tags\"]:\n",
290292
" # Add metadata tags to the metadata dictionary\n",
291293
" for tag in entry[\"tags\"].split(\";\"):\n",
292294
" metadata[tag] = \"y\"\n",
293295
" # Construct the Document, with the quote and metadata tags\n",
294-
" new_document = Document(page_content=entry[\"quote\"], metadata=metadata)\n",
296+
" new_document = Document(\n",
297+
" id=f\"{entry['author'][:4]}_{entry_idx:03}\",\n",
298+
" page_content=entry[\"quote\"],\n",
299+
" metadata=metadata,\n",
300+
" )\n",
295301
" documents_to_insert.append(new_document)\n",
296302
"\n",
297-
"print(f\"Ready to insert {len(documents_to_insert)} documents.\")"
303+
"print(f\"Ready to insert {len(documents_to_insert)} documents.\")\n",
304+
"print(f\"Example document: {documents_to_insert[16]}\")"
298305
]
299306
},
300307
{
@@ -316,7 +323,7 @@
316323
"source": [
317324
"inserted_ids = vector_store.add_documents(documents_to_insert)\n",
318325
"\n",
319-
"print(f\"\\nInserted {len(inserted_ids)} documents.\")"
326+
"print(f\"\\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...\")"
320327
]
321328
},
322329
{
@@ -357,7 +364,7 @@
357364
"source": [
358365
"### Use `add_texts`\n",
359366
"\n",
360-
"Storing entries in the vector store through `add_texts` has the advantage that you can specify the IDs, so that you don't risk duplicating the entries if you run the insertion multiple times."
367+
"You can store documents through `add_texts` and supply three parallel lists for the texts, the metadata and the IDs."
361368
]
362369
},
363370
{
@@ -376,10 +383,10 @@
376383
" {\"author\": \"husserl\", \"knowledge\": \"y\"},\n",
377384
"]\n",
378385
"ids = [\n",
379-
" \"desc_01\",\n",
380-
" \"huss_xy\",\n",
386+
" \"desc_999\",\n",
387+
" \"huss_888\",\n",
381388
"]\n",
382-
"inserted_ids_2 = vstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n",
389+
"inserted_ids_2 = vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n",
383390
"print(f\"\\nInserted {len(inserted_ids_2)} documents.\")"
384391
]
385392
},
@@ -398,9 +405,9 @@
398405
"metadata": {},
399406
"outputs": [],
400407
"source": [
401-
"results = vstore.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n",
408+
"results = vector_store.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n",
402409
"for res, score in results:\n",
403-
" print(f\"* [SIM={score:3f}] {res.page_content} [{res.metadata}]\")"
410+
" print(f\"* [{score:.3f}] {res.page_content} [{res.metadata}]\")"
404411
]
405412
},
406413
{
@@ -418,7 +425,7 @@
418425
"metadata": {},
419426
"outputs": [],
420427
"source": [
421-
"results = vstore.similarity_search(\n",
428+
"results = vector_store.similarity_search(\n",
422429
" \"Our life is what we make of it\",\n",
423430
" k=3,\n",
424431
" filter={\"author\": \"aristotle\"},\n",
@@ -442,7 +449,7 @@
442449
"metadata": {},
443450
"outputs": [],
444451
"source": [
445-
"results = vstore.max_marginal_relevance_search(\n",
452+
"results = vector_store.max_marginal_relevance_search(\n",
446453
" \"Our life is what we make of it\",\n",
447454
" k=3,\n",
448455
" filter={\"author\": \"aristotle\"},\n",
@@ -456,7 +463,7 @@
456463
"id": "14cb448d-91d1-4edc-8047-adcfa87b5afc",
457464
"metadata": {},
458465
"source": [
459-
"### Deleting documents from the store"
466+
"### Delete documents from the store"
460467
]
461468
},
462469
{
@@ -474,7 +481,7 @@
474481
"metadata": {},
475482
"outputs": [],
476483
"source": [
477-
"delete_1 = vstore.delete(inserted_ids[:3])\n",
484+
"delete_1 = vector_store.delete(inserted_ids[:3])\n",
478485
"print(f\"delete result = {delete_1}\")"
479486
]
480487
},
@@ -485,7 +492,7 @@
485492
"metadata": {},
486493
"outputs": [],
487494
"source": [
488-
"delete_2 = vstore.delete(inserted_ids[2:5])\n",
495+
"delete_2 = vector_store.delete(inserted_ids[2:5])\n",
489496
"print(f\"delete result = {delete_2}\")"
490497
]
491498
},
@@ -494,9 +501,9 @@
494501
"id": "cdc96eda-0047-485b-962c-60fe329ab1b3",
495502
"metadata": {},
496503
"source": [
497-
"### Retrieve and then delete\n",
504+
"#### Retrieve and then delete\n",
498505
"\n",
499-
"Sometimes you do not have the IDs, ... but you might want to run a search and then delete the results for some reason:"
506+
"Sometimes you do not have the IDs, but you might want to run a search and then delete the results:"
500507
]
501508
},
502509
{
@@ -507,15 +514,15 @@
507514
"outputs": [],
508515
"source": [
509516
"ids_to_delete = []\n",
510-
"for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n",
517+
"for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n",
511518
" \"Philosophy has no goals\",\n",
512519
" k=2,\n",
513520
"):\n",
514-
" print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n",
521+
" print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n",
515522
" ids_to_delete.append(res_id)\n",
516523
"\n",
517-
"print(f\"Deleting IDs = {ids_to_delete} ...\")\n",
518-
"success = vstore.delete(ids_to_delete)\n",
524+
"print(f\"\\nDeleting IDs = {ids_to_delete} ...\")\n",
525+
"success = vector_store.delete(ids_to_delete)\n",
519526
"print(f\"Deletion succeeded = {success}\")"
520527
]
521528
},
@@ -534,19 +541,19 @@
534541
"metadata": {},
535542
"outputs": [],
536543
"source": [
537-
"for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n",
544+
"for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n",
538545
" \"Philosophy has no goals\",\n",
539546
" k=2,\n",
540547
"):\n",
541-
" print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")"
548+
" print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")"
542549
]
543550
},
544551
{
545552
"cell_type": "markdown",
546553
"id": "a8280918-25ea-40f4-82c1-78d0b9b27278",
547554
"metadata": {},
548555
"source": [
549-
"### Delete the **whole** stored data\n",
556+
"#### Delete the **whole** stored data\n",
550557
"\n",
551558
"> _Warning: use with caution. Data loss!_"
552559
]
@@ -558,7 +565,7 @@
558565
"metadata": {},
559566
"outputs": [],
560567
"source": [
561-
"vstore.clear()"
568+
"vector_store.clear()"
562569
]
563570
},
564571
{
@@ -574,7 +581,7 @@
574581
"id": "2c08d41e-65f8-4096-8b9d-9f12ed91c485",
575582
"metadata": {},
576583
"source": [
577-
"Let us completely delete the collection, thereby freeing the associated resources on Astra DB:\n",
584+
"Completely delete the collection, thereby freeing the associated resources on Astra DB:\n",
578585
"\n",
579586
"> _Warning: use with caution. Data loss!_"
580587
]
@@ -586,7 +593,25 @@
586593
"metadata": {},
587594
"outputs": [],
588595
"source": [
589-
"vstore.delete_collection()"
596+
"vector_store.delete_collection()"
597+
]
598+
},
599+
{
600+
"cell_type": "markdown",
601+
"id": "41653b13-c903-4e7f-9806-afed3f4d726c",
602+
"metadata": {},
603+
"source": [
604+
"## Next steps"
605+
]
606+
},
607+
{
608+
"cell_type": "markdown",
609+
"id": "08d6c352-cdcc-46a5-9683-376f55d16b45",
610+
"metadata": {},
611+
"source": [
612+
"- [This quickstart on DataStax documentation](https://docs.datastax.com/en/astra-db-serverless/integrations/langchain.html)\n",
613+
"- [`AstraDBVectorStore` in LangChain docs](https://python.langchain.com/docs/integrations/providers/astradb/#vector-store)\n",
614+
"- [`AstraDBVectorStore`, API Reference](https://python.langchain.com/api_reference/astradb/vectorstores/langchain_astradb.vectorstores.AstraDBVectorStore.html#langchain_astradb.vectorstores.AstraDBVectorStore)"
590615
]
591616
}
592617
],
@@ -606,7 +631,7 @@
606631
"name": "python",
607632
"nbconvert_exporter": "python",
608633
"pygments_lexer": "ipython3",
609-
"version": "3.12.0"
634+
"version": "3.12.8"
610635
}
611636
},
612637
"nbformat": 4,

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# mini-demo-astradb-langchain
22

3+
For more information, visit the DataStax [Astra DB docs page](https://docs.datastax.com/en/astra-db-serverless/integrations/langchain.html).
4+
35
[Open in Colab](https://colab.research.google.com/github/datastaxdevs/mini-demo-astradb-langchain/blob/main/AstraDB_langchain_quickstart_1.ipynb)
46

57
## Alternatively, run locally
@@ -14,7 +16,8 @@ ASTRA_DB_APPLICATION_TOKEN="AstraCS:..."
1416
1517
ASTRA_DB_KEYSPACE="..." # OPTIONAL
1618
17-
OPENAI_API_KEY="..." # OPTIONAL (not required with 'vectorize')
19+
OPENAI_API_KEY="..." # OPTIONAL (required if using explicit embeddings)
20+
ASTRA_DB_API_KEY_NAME="..." # OPTIONAL (required if using 'vectorize')
1821
```
1922

2023
Open in Jupyter and run each cell.

integrate_explicit_embeddings.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
"""
Quickstart script: load a dataset of philosopher quotes into an Astra DB
vector store through LangChain, using explicit (client-side) OpenAI
embeddings, then run a similarity search to verify the integration.

Required dependencies:

    pip install \
        "langchain>=0.3,<0.4" \
        "langchain-astradb>=0.6,<0.7" \
        "langchain-openai>=0.3,<0.4" \
        "datasets>=3.5,<4.0" \
        "python-dotenv"

Requires a `.env` file with environment variables, see `template.env`.
"""

# Import dependencies
# NOTE(review): `getpass` and `VectorServiceOptions` are not referenced
# anywhere in this script; they are kept so the import block is unchanged.
import os
from getpass import getpass

from astrapy.info import VectorServiceOptions
from langchain_astradb import AstraDBVectorStore

from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

from datasets import load_dataset
from dotenv import load_dotenv


# Load environment variables
load_dotenv()

# Mandatory settings: a missing variable raises KeyError right here.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
# Optional settings: an unset or empty variable is normalized to None.
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE") or None
ASTRA_DB_API_KEY_NAME = os.environ.get("ASTRA_DB_API_KEY_NAME") or None

# Not passed explicitly to OpenAIEmbeddings below; presumably the
# embeddings client picks the key up from the environment by itself.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or None


# Create a vector store
embedding = OpenAIEmbeddings()
vector_store = AstraDBVectorStore(
    collection_name="langchain_integration_demo",
    embedding=embedding,
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace=ASTRA_DB_KEYSPACE,
)


# Load data
philo_dataset = load_dataset("datastax/philosopher-quotes")["train"]

print("An example entry:")
print(philo_dataset[16])


# Process dataset
documents_to_insert = []

for entry_idx, entry in enumerate(philo_dataset):
    metadata = {"author": entry["author"]}
    if entry["tags"]:
        # Add metadata tags to the metadata dictionary
        # (tags arrive as one ";"-separated string; each becomes a "y" flag)
        for tag in entry["tags"].split(";"):
            metadata[tag] = "y"
    # Construct the Document, with the quote and metadata tags.
    # The ID is built from the first four characters of the author name and
    # the zero-padded position in the dataset, so it is the same on every
    # run over the same dataset.
    new_document = Document(
        id=f"{entry['author'][:4]}_{entry_idx:03}",
        page_content=entry["quote"],
        metadata=metadata,
    )
    documents_to_insert.append(new_document)

print(f"Ready to insert {len(documents_to_insert)} documents.")
print(f"Example document: {documents_to_insert[16]}")


# Insert documents
inserted_ids = vector_store.add_documents(documents_to_insert)

print(f"\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...")


# Verify the integration
results = vector_store.similarity_search("Our life is what we make of it", k=3)

for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

0 commit comments

Comments
 (0)