datastaxdevs
diff --git a/‎AstraDB_langchain_quickstart_1.ipynb
Lines changed: 60 additions & 35 deletions b/‎AstraDB_langchain_quickstart_1.ipynb
Lines changed: 60 additions & 35 deletions
diff --git a/‎README.md
Lines changed: 4 additions & 1 deletion b/‎README.md
Lines changed: 4 additions & 1 deletion
diff --git a/‎integrate_explicit_embeddings.py
Lines changed: 87 additions & 0 deletions b/‎integrate_explicit_embeddings.py
Lines changed: 87 additions & 0 deletions
@@ -122,9 +122,9 @@
    "id": "5a9fef37-ec79-4aa3-8817-5722909effc3",
    "metadata": {},
    "source": [
-    "### additional step for Azure OpenAI\n",
+    "##### Additional step for Azure OpenAI\n",
     "\n",
-    "If you use Microsoft Azure OpenAI, uncomment the following cell and set additional environment variables, editing as needed:\n",
+    "If you use Microsoft Azure OpenAI, uncomment the following cell and edit as needed to set additional environment variables:\n",
     "\n",
     "_(remember the `OPENAI_API_KEY` provided earlier must be appropriate to Azure.)_"
    ]
@@ -161,7 +161,7 @@
     "ASTRA_DB_KEYSPACE = os.environ.get(\"ASTRA_DB_KEYSPACE\") or None\n",
     "ASTRA_DB_API_KEY_NAME = os.environ.get(\"ASTRA_DB_API_KEY_NAME\") or None\n",
     "\n",
-    "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]"
+    "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\") or None"
    ]
   },
   {
@@ -184,6 +184,7 @@
    "outputs": [],
    "source": [
     "# Edit if necessary, then run the cell\n",
+    "\n",
     "USE_VECTORIZE = True  # server-side embeddings\n",
     "# USE_VECTORIZE = False  # explicit embeddings"
    ]
@@ -229,12 +230,13 @@
     "        namespace=ASTRA_DB_KEYSPACE,\n",
     "    )\n",
     "\n",
+    "\n",
     "## If you already have a populated vector collection, try this instead\n",
-    "## (and skip the 'load dataset' phase if you are so inclined):\n",
+    "## (and then skip the load+process+insert phases if you are so inclined):\n",
     "\n",
-    "# vector_store =  = AstraDBVectorStore(\n",
+    "# vector_store = AstraDBVectorStore(\n",
     "#     collection_name=\"INSERT_YOUR_COLLECTION_NAME\",\n",
-    "#     embedding=embedding,  # omit for vectorize; otherwise, must match the data on DB\n",
+    "#     embedding=EMBEDDING,  # omit for vectorize; else, must be the same used for the data on DB\n",
     "#     token=ASTRA_DB_APPLICATION_TOKEN,\n",
     "#     api_endpoint=ASTRA_DB_API_ENDPOINT,\n",
     "#     namespace=ASTRA_DB_KEYSPACE,\n",
@@ -249,7 +251,7 @@
    "source": [
     "### Load data\n",
     "\n",
-    "Load a small dataset of phlosophical quotes using the Python `dataset` package."
+    "Load a small dataset of philosophical quotes using the Python `datasets` package."
    ]
   },
   {
@@ -284,17 +286,22 @@
    "source": [
     "documents_to_insert = []\n",
     "\n",
-    "for entry in philo_dataset:\n",
+    "for entry_idx, entry in enumerate(philo_dataset):\n",
     "    metadata = {\"author\": entry[\"author\"]}\n",
     "    if entry[\"tags\"]:\n",
     "        # Add metadata tags to the metadata dictionary\n",
     "        for tag in entry[\"tags\"].split(\";\"):\n",
     "            metadata[tag] = \"y\"\n",
     "    # Construct the Document, with the quote and metadata tags\n",
-    "    new_document = Document(page_content=entry[\"quote\"], metadata=metadata)\n",
+    "    new_document = Document(\n",
+    "        id=f\"{entry['author'][:4]}_{entry_idx:03}\",\n",
+    "        page_content=entry[\"quote\"],\n",
+    "        metadata=metadata,\n",
+    "    )\n",
     "    documents_to_insert.append(new_document)\n",
     "\n",
-    "print(f\"Ready to insert {len(documents_to_insert)} documents.\")"
+    "print(f\"Ready to insert {len(documents_to_insert)} documents.\")\n",
+    "print(f\"Example document: {documents_to_insert[16]}\")"
    ]
   },
   {
@@ -316,7 +323,7 @@
    "source": [
     "inserted_ids = vector_store.add_documents(documents_to_insert)\n",
     "\n",
-    "print(f\"\\nInserted {len(inserted_ids)} documents.\")"
+    "print(f\"\\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...\")"
    ]
   },
   {
@@ -357,7 +364,7 @@
    "source": [
     "### Use `add_texts`\n",
     "\n",
-    "Storing entries in the vector store through `add_texts` has the advantage that you can specify the IDs, so that you don't risk duplicating the entries if you run the insertion multiple times."
+    "You can store documents through `add_texts` and supply three parallel lists for the texts, the metadata and the IDs."
    ]
   },
   {
@@ -376,10 +383,10 @@
     "    {\"author\": \"husserl\", \"knowledge\": \"y\"},\n",
     "]\n",
     "ids = [\n",
-    "    \"desc_01\",\n",
-    "    \"huss_xy\",\n",
+    "    \"desc_999\",\n",
+    "    \"huss_888\",\n",
     "]\n",
-    "inserted_ids_2 = vstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n",
+    "inserted_ids_2 = vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n",
     "print(f\"\\nInserted {len(inserted_ids_2)} documents.\")"
    ]
   },
@@ -398,9 +405,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = vstore.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n",
+    "results = vector_store.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n",
     "for res, score in results:\n",
-    "    print(f\"* [SIM={score:3f}] {res.page_content} [{res.metadata}]\")"
+    "    print(f\"* [{score:.3f}] {res.page_content} [{res.metadata}]\")"
    ]
   },
   {
@@ -418,7 +425,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = vstore.similarity_search(\n",
+    "results = vector_store.similarity_search(\n",
     "    \"Our life is what we make of it\",\n",
     "    k=3,\n",
     "    filter={\"author\": \"aristotle\"},\n",
@@ -442,7 +449,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = vstore.max_marginal_relevance_search(\n",
+    "results = vector_store.max_marginal_relevance_search(\n",
     "    \"Our life is what we make of it\",\n",
     "    k=3,\n",
     "    filter={\"author\": \"aristotle\"},\n",
@@ -456,7 +463,7 @@
    "id": "14cb448d-91d1-4edc-8047-adcfa87b5afc",
    "metadata": {},
    "source": [
-    "### Deleting documents from the store"
+    "### Delete documents from the store"
    ]
   },
   {
@@ -474,7 +481,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "delete_1 = vstore.delete(inserted_ids[:3])\n",
+    "delete_1 = vector_store.delete(inserted_ids[:3])\n",
     "print(f\"delete result = {delete_1}\")"
    ]
   },
@@ -485,7 +492,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "delete_2 = vstore.delete(inserted_ids[2:5])\n",
+    "delete_2 = vector_store.delete(inserted_ids[2:5])\n",
     "print(f\"delete result = {delete_2}\")"
    ]
   },
@@ -494,9 +501,9 @@
    "id": "cdc96eda-0047-485b-962c-60fe329ab1b3",
    "metadata": {},
    "source": [
-    "### Retrieve and then delete\n",
+    "#### Retrieve and then delete\n",
     "\n",
-    "Sometimes you do not have the IDs, ... but you might want to run a search and then delete the results for some reason:"
+    "Sometimes you do not have the IDs, but you might still want to run a search and then delete the results:"
    ]
   },
   {
@@ -507,15 +514,15 @@
    "outputs": [],
    "source": [
     "ids_to_delete = []\n",
-    "for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n",
+    "for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n",
     "    \"Philosophy has no goals\",\n",
     "    k=2,\n",
     "):\n",
-    "    print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n",
+    "    print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n",
     "    ids_to_delete.append(res_id)\n",
     "\n",
-    "print(f\"Deleting IDs = {ids_to_delete} ...\")\n",
-    "success = vstore.delete(ids_to_delete)\n",
+    "print(f\"\\nDeleting IDs = {ids_to_delete} ...\")\n",
+    "success = vector_store.delete(ids_to_delete)\n",
     "print(f\"Deletion succeeded = {success}\")"
    ]
   },
@@ -534,19 +541,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n",
+    "for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n",
     "    \"Philosophy has no goals\",\n",
     "    k=2,\n",
     "):\n",
-    "    print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")"
+    "    print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "a8280918-25ea-40f4-82c1-78d0b9b27278",
    "metadata": {},
    "source": [
-    "### Delete the **whole** stored data\n",
+    "#### Delete the **whole** stored data\n",
     "\n",
     "> _Warning: use with caution. Data loss!_"
    ]
@@ -558,7 +565,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "vstore.clear()"
+    "vector_store.clear()"
    ]
   },
   {
@@ -574,7 +581,7 @@
    "id": "2c08d41e-65f8-4096-8b9d-9f12ed91c485",
    "metadata": {},
    "source": [
-    "Let us completely delete the collection, thereby freeing the associated resources on Astra DB:\n",
+    "Completely delete the collection, thereby freeing the associated resources on Astra DB:\n",
     "\n",
     "> _Warning: use with caution. Data loss!_"
    ]
@@ -586,7 +593,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "vstore.delete_collection()"
+    "vector_store.delete_collection()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41653b13-c903-4e7f-9806-afed3f4d726c",
+   "metadata": {},
+   "source": [
+    "## Next steps"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08d6c352-cdcc-46a5-9683-376f55d16b45",
+   "metadata": {},
+   "source": [
+    "- [This quickstart on DataStax documentation](https://docs.datastax.com/en/astra-db-serverless/integrations/langchain.html)\n",
+    "- [`AstraDBVectorStore` in LangChain docs](https://python.langchain.com/docs/integrations/providers/astradb/#vector-store)\n",
+    "- [`AstraDBVectorStore`, API Reference](https://python.langchain.com/api_reference/astradb/vectorstores/langchain_astradb.vectorstores.AstraDBVectorStore.html#langchain_astradb.vectorstores.AstraDBVectorStore)"
    ]
   }
  ],
@@ -606,7 +631,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.0"
+   "version": "3.12.8"
   }
  },
  "nbformat": 4,
 
@@ -1,5 +1,7 @@
 # mini-demo-astradb-langchain
 
+For more information, visit the DataStax [Astra DB docs page](https://docs.datastax.com/en/astra-db-serverless/integrations/langchain.html).
+
 [Open in Colab](https://colab.research.google.com/github/datastaxdevs/mini-demo-astradb-langchain/blob/main/AstraDB_langchain_quickstart_1.ipynb)
 
 ## Alternatively, run locally 
@@ -14,7 +16,8 @@ ASTRA_DB_APPLICATION_TOKEN="AstraCS:..."
 
 ASTRA_DB_KEYSPACE="..."             # OPTIONAL
 
-OPENAI_API_KEY="..."                # OPTIONAL (not required with 'vectorize')
+OPENAI_API_KEY="..."                # OPTIONAL (required if using explicit embeddings)
+ASTRA_DB_API_KEY_NAME="..."         # OPTIONAL (required if using 'vectorize')
 ```
 
 Open in Jupyter and run each cell.
 
@@ -0,0 +1,87 @@
+"""
+Required dependencies:
+
+    pip install \
+        "langchain>=0.3,<0.4" \
+        "langchain-astradb>=0.6,<0.7" \
+        "langchain-openai>=0.3,<0.4" \
+        "datasets>=3.5,<4.0"
+
+Requires a `.env` file with environment variables (see `template.env`); it is loaded with `load_dotenv` from the `python-dotenv` package, which is not in the list above.
+"""
+
+# Import dependencies
+import os
+from getpass import getpass
+
+from astrapy.info import VectorServiceOptions
+from langchain_astradb import AstraDBVectorStore
+
+from langchain_core.documents import Document
+from langchain_openai import OpenAIEmbeddings
+
+from datasets import load_dataset
+from dotenv import load_dotenv
+
+
+# Load environment variables: load_dotenv() is called first, then the settings are read from os.environ
+load_dotenv()
+
+ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]  # required: KeyError if unset
+ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]  # required: KeyError if unset
+ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE") or None  # optional: unset or empty becomes None
+ASTRA_DB_API_KEY_NAME = os.environ.get("ASTRA_DB_API_KEY_NAME") or None  # read here, not used further down in this script
+
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or None  # read here, not passed explicitly to anything below
+
+
+# Create a vector store that uses explicit embeddings: an OpenAIEmbeddings instance is handed to the store
+embedding = OpenAIEmbeddings()  # NOTE(review): no key passed; presumably taken from the OPENAI_API_KEY env var - confirm
+vector_store = AstraDBVectorStore(
+    collection_name="langchain_integration_demo",
+    embedding=embedding,
+    token=ASTRA_DB_APPLICATION_TOKEN,
+    api_endpoint=ASTRA_DB_API_ENDPOINT,
+    namespace=ASTRA_DB_KEYSPACE,
+)
+
+
+# Load data: the "train" split of the datastax/philosopher-quotes dataset
+philo_dataset = load_dataset("datastax/philosopher-quotes")["train"]
+
+print("An example entry:")
+print(philo_dataset[16])
+
+
+# Process dataset: turn every entry into a Document carrying an explicit ID
+documents_to_insert = []
+
+for entry_idx, entry in enumerate(philo_dataset):
+    metadata = {"author": entry["author"]}
+    if entry["tags"]:
+        # Add metadata tags to the metadata dictionary: each ";"-separated tag becomes a key with value "y"
+        for tag in entry["tags"].split(";"):
+            metadata[tag] = "y"
+    # Construct the Document, with the quote and metadata tags; the ID is built from the author and the entry index
+    new_document = Document(
+        id=f"{entry['author'][:4]}_{entry_idx:03}",  # first 4 characters of the author + "_" + index zero-padded to 3 digits
+        page_content=entry["quote"],
+        metadata=metadata,
+    )
+    documents_to_insert.append(new_document)
+
+print(f"Ready to insert {len(documents_to_insert)} documents.")
+print(f"Example document: {documents_to_insert[16]}")
+
+
+# Insert documents; the value returned by add_documents is reported as the inserted IDs (first three shown)
+inserted_ids = vector_store.add_documents(documents_to_insert)
+
+print(f"\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...")
+
+
+# Verify the integration: run a similarity search (k=3) and print each result's text and metadata
+results = vector_store.similarity_search("Our life is what we make of it", k=3)
+
+for res in results:
+    print(f"* {res.page_content} [{res.metadata}]")