|
122 | 122 | "id": "5a9fef37-ec79-4aa3-8817-5722909effc3",
|
123 | 123 | "metadata": {},
|
124 | 124 | "source": [
|
125 |
| - "### additional step for Azure OpenAI\n", |
| 125 | + "##### Additional step for Azure OpenAI\n", |
126 | 126 | "\n",
|
127 |
| - "If you use Microsoft Azure OpenAI, uncomment the following cell and set additional environment variables, editing as needed:\n", |
| 127 | + "If you use Microsoft Azure OpenAI, uncomment the following cell and edit as needed to set additional environment variables:\n", |
128 | 128 | "\n",
|
129 | 129 | "_(remember that the `OPENAI_API_KEY` provided earlier must be valid for Azure.)_"
|
130 | 130 | ]
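The Azure-specific cell referenced here lies outside the hunks shown. As a rough sketch only, with variable names that are assumptions based on the usual `langchain-openai` Azure setup rather than taken from this notebook, the extra settings would look something like:

```python
import os

# Assumed variable names and values -- adjust to your Azure OpenAI resource.
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<your-resource>.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2024-02-01"
# Deployment names for the embedding/chat models are typically set here as well.
```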
|
|
161 | 161 | "ASTRA_DB_KEYSPACE = os.environ.get(\"ASTRA_DB_KEYSPACE\") or None\n",
|
162 | 162 | "ASTRA_DB_API_KEY_NAME = os.environ.get(\"ASTRA_DB_API_KEY_NAME\") or None\n",
|
163 | 163 | "\n",
|
164 |
| - "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]" |
| 164 | + "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\") or None" |
165 | 165 | ]
|
166 | 166 | },
|
167 | 167 | {
|
|
184 | 184 | "outputs": [],
|
185 | 185 | "source": [
|
186 | 186 | "# Edit if necessary, then run the cell\n",
|
| 187 | + "\n", |
187 | 188 | "USE_VECTORIZE = True # server-side embeddings\n",
|
188 | 189 | "# USE_VECTORIZE = False # explicit embeddings"
|
189 | 190 | ]
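The cell that prepares the client-side embedding model is not part of this diff; a minimal sketch of what the `USE_VECTORIZE` switch typically guards (the OpenAI model name here is an assumption):

```python
from langchain_openai import OpenAIEmbeddings

if USE_VECTORIZE:
    # Server-side embeddings: Astra DB's vectorize computes vectors on insert and query.
    EMBEDDING = None
else:
    # Explicit (client-side) embeddings; the model name is an assumption.
    EMBEDDING = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
```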
|
|
229 | 230 | " namespace=ASTRA_DB_KEYSPACE,\n",
|
230 | 231 | " )\n",
|
231 | 232 | "\n",
|
| 233 | + "\n", |
232 | 234 | "## If you already have a populated vector collection, try this instead\n",
|
233 |
| - "## (and skip the 'load dataset' phase if you are so inclined):\n", |
| 235 | + "## (and then skip the load+process+insert phases if you are so inclined):\n", |
234 | 236 | "\n",
|
235 |
| - "# vector_store = = AstraDBVectorStore(\n", |
| 237 | + "# vector_store = AstraDBVectorStore(\n", |
236 | 238 | "# collection_name=\"INSERT_YOUR_COLLECTION_NAME\",\n",
|
237 |
| - "# embedding=embedding, # omit for vectorize; otherwise, must match the data on DB\n", |
| 239 | + "# embedding=EMBEDDING, # omit for vectorize; else, must be the same used for the data on DB\n", |
238 | 240 | "# token=ASTRA_DB_APPLICATION_TOKEN,\n",
|
239 | 241 | "# api_endpoint=ASTRA_DB_API_ENDPOINT,\n",
|
240 | 242 | "# namespace=ASTRA_DB_KEYSPACE,\n",
|
|
249 | 251 | "source": [
|
250 | 252 | "### Load data\n",
|
251 | 253 | "\n",
|
252 |
| - "Load a small dataset of phlosophical quotes using the Python `dataset` package." |
| 254 | + "Load a small dataset of philosophical quotes using the Python `dataset` package." |
253 | 255 | ]
|
254 | 256 | },
|
255 | 257 | {
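The loading cell itself is not shown in the diff; a minimal sketch, assuming the quotes come from the `datastax/philosopher-quotes` dataset on Hugging Face (an assumption not confirmed by these hunks):

```python
from datasets import load_dataset

# Assumed dataset; each entry exposes "author", "quote" and "tags" fields.
philo_dataset = load_dataset("datastax/philosopher-quotes")["train"]
print("An example entry:", philo_dataset[16])
```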
|
|
284 | 286 | "source": [
|
285 | 287 | "documents_to_insert = []\n",
|
286 | 288 | "\n",
|
287 |
| - "for entry in philo_dataset:\n", |
| 289 | + "for entry_idx, entry in enumerate(philo_dataset):\n", |
288 | 290 | " metadata = {\"author\": entry[\"author\"]}\n",
|
289 | 291 | " if entry[\"tags\"]:\n",
|
290 | 292 | " # Add metadata tags to the metadata dictionary\n",
|
291 | 293 | " for tag in entry[\"tags\"].split(\";\"):\n",
|
292 | 294 | " metadata[tag] = \"y\"\n",
|
293 | 295 | " # Construct the Document, with the quote and metadata tags\n",
|
294 |
| - " new_document = Document(page_content=entry[\"quote\"], metadata=metadata)\n", |
| 296 | + " new_document = Document(\n", |
| 297 | + " id=f\"{entry['author'][:4]}_{entry_idx:03}\",\n", |
| 298 | + " page_content=entry[\"quote\"],\n", |
| 299 | + " metadata=metadata,\n", |
| 300 | + " )\n", |
295 | 301 | " documents_to_insert.append(new_document)\n",
|
296 | 302 | "\n",
|
297 |
| - "print(f\"Ready to insert {len(documents_to_insert)} documents.\")" |
| 303 | + "print(f\"Ready to insert {len(documents_to_insert)} documents.\")\n", |
| 304 | + "print(f\"Example document: {documents_to_insert[16]}\")" |
298 | 305 | ]
|
299 | 306 | },
|
300 | 307 | {
|
|
316 | 323 | "source": [
|
317 | 324 | "inserted_ids = vector_store.add_documents(documents_to_insert)\n",
|
318 | 325 | "\n",
|
319 |
| - "print(f\"\\nInserted {len(inserted_ids)} documents.\")" |
| 326 | + "print(f\"\\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...\")" |
320 | 327 | ]
|
321 | 328 | },
|
322 | 329 | {
|
|
357 | 364 | "source": [
|
358 | 365 | "### Use `add_texts`\n",
|
359 | 366 | "\n",
|
360 |
| - "Storing entries in the vector store through `add_texts` has the advantage that you can specify the IDs, so that you don't risk duplicating the entries if you run the insertion multiple times." |
| 367 | + "You can store documents through `add_texts` and supply three parallel lists for the texts, the metadata and the IDs." |
361 | 368 | ]
|
362 | 369 | },
|
363 | 370 | {
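The `texts` list in the next cell falls outside the hunk; here is a condensed sketch of the full call with placeholder quote texts (the IDs and the husserl metadata entry mirror the cell below, the rest is illustrative):

```python
# Placeholder texts; metadatas/ids follow the cell shown below.
texts = [
    "I think, therefore I am.",
    "To the things themselves!",
]
metadatas = [
    {"author": "descartes", "knowledge": "y"},
    {"author": "husserl", "knowledge": "y"},
]
ids = ["desc_999", "huss_888"]

inserted_ids_2 = vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)
```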
|
|
376 | 383 | " {\"author\": \"husserl\", \"knowledge\": \"y\"},\n",
|
377 | 384 | "]\n",
|
378 | 385 | "ids = [\n",
|
379 |
| - " \"desc_01\",\n", |
380 |
| - " \"huss_xy\",\n", |
| 386 | + " \"desc_999\",\n", |
| 387 | + " \"huss_888\",\n", |
381 | 388 | "]\n",
|
382 |
| - "inserted_ids_2 = vstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n", |
| 389 | + "inserted_ids_2 = vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)\n", |
383 | 390 | "print(f\"\\nInserted {len(inserted_ids_2)} documents.\")"
|
384 | 391 | ]
|
385 | 392 | },
|
|
398 | 405 | "metadata": {},
|
399 | 406 | "outputs": [],
|
400 | 407 | "source": [
|
401 |
| - "results = vstore.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n", |
| 408 | + "results = vector_store.similarity_search_with_score(\"Our life is what we make of it\", k=3)\n", |
402 | 409 | "for res, score in results:\n",
|
403 |
| - " print(f\"* [SIM={score:3f}] {res.page_content} [{res.metadata}]\")" |
| 410 | + " print(f\"* [{score:.3f}] {res.page_content} [{res.metadata}]\")" |
404 | 411 | ]
|
405 | 412 | },
|
406 | 413 | {
|
|
418 | 425 | "metadata": {},
|
419 | 426 | "outputs": [],
|
420 | 427 | "source": [
|
421 |
| - "results = vstore.similarity_search(\n", |
| 428 | + "results = vector_store.similarity_search(\n", |
422 | 429 | " \"Our life is what we make of it\",\n",
|
423 | 430 | " k=3,\n",
|
424 | 431 | " filter={\"author\": \"aristotle\"},\n",
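Because the ingestion loop stored each tag as a `"y"`-valued metadata field, the same `filter=` argument also works on tags; a small variation (the `"knowledge"` tag is just one example):

```python
# Filter on a tag-derived metadata flag instead of the author.
results = vector_store.similarity_search(
    "Our life is what we make of it",
    k=3,
    filter={"knowledge": "y"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")
```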
|
|
442 | 449 | "metadata": {},
|
443 | 450 | "outputs": [],
|
444 | 451 | "source": [
|
445 |
| - "results = vstore.max_marginal_relevance_search(\n", |
| 452 | + "results = vector_store.max_marginal_relevance_search(\n", |
446 | 453 | " \"Our life is what we make of it\",\n",
|
447 | 454 | " k=3,\n",
|
448 | 455 | " filter={\"author\": \"aristotle\"},\n",
|
|
456 | 463 | "id": "14cb448d-91d1-4edc-8047-adcfa87b5afc",
|
457 | 464 | "metadata": {},
|
458 | 465 | "source": [
|
459 |
| - "### Deleting documents from the store" |
| 466 | + "### Delete documents from the store" |
460 | 467 | ]
|
461 | 468 | },
|
462 | 469 | {
|
|
474 | 481 | "metadata": {},
|
475 | 482 | "outputs": [],
|
476 | 483 | "source": [
|
477 |
| - "delete_1 = vstore.delete(inserted_ids[:3])\n", |
| 484 | + "delete_1 = vector_store.delete(inserted_ids[:3])\n", |
478 | 485 | "print(f\"delete result = {delete_1}\")"
|
479 | 486 | ]
|
480 | 487 | },
|
|
485 | 492 | "metadata": {},
|
486 | 493 | "outputs": [],
|
487 | 494 | "source": [
|
488 |
| - "delete_2 = vstore.delete(inserted_ids[2:5])\n", |
| 495 | + "delete_2 = vector_store.delete(inserted_ids[2:5])\n", |
489 | 496 | "print(f\"delete result = {delete_2}\")"
|
490 | 497 | ]
|
491 | 498 | },
|
|
494 | 501 | "id": "cdc96eda-0047-485b-962c-60fe329ab1b3",
|
495 | 502 | "metadata": {},
|
496 | 503 | "source": [
|
497 |
| - "### Retrieve and then delete\n", |
| 504 | + "#### Retrieve and then delete\n", |
498 | 505 | "\n",
|
499 |
| - "Sometimes you do not have the IDs, ... but you might want to run a search and then delete the results for some reason:" |
| 506 | + "Sometimes you do not have the IDs, ... but you might want to run a search and then delete the results:" |
500 | 507 | ]
|
501 | 508 | },
|
502 | 509 | {
|
|
507 | 514 | "outputs": [],
|
508 | 515 | "source": [
|
509 | 516 | "ids_to_delete = []\n",
|
510 |
| - "for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n", |
| 517 | + "for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n", |
511 | 518 | " \"Philosophy has no goals\",\n",
|
512 | 519 | " k=2,\n",
|
513 | 520 | "):\n",
|
514 |
| - " print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n", |
| 521 | + " print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")\n", |
515 | 522 | " ids_to_delete.append(res_id)\n",
|
516 | 523 | "\n",
|
517 |
| - "print(f\"Deleting IDs = {ids_to_delete} ...\")\n", |
518 |
| - "success = vstore.delete(ids_to_delete)\n", |
| 524 | + "print(f\"\\nDeleting IDs = {ids_to_delete} ...\")\n", |
| 525 | + "success = vector_store.delete(ids_to_delete)\n", |
519 | 526 | "print(f\"Deletion succeeded = {success}\")"
|
520 | 527 | ]
|
521 | 528 | },
|
|
534 | 541 | "metadata": {},
|
535 | 542 | "outputs": [],
|
536 | 543 | "source": [
|
537 |
| - "for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(\n", |
| 544 | + "for res_doc, res_score, res_id in vector_store.similarity_search_with_score_id(\n", |
538 | 545 | " \"Philosophy has no goals\",\n",
|
539 | 546 | " k=2,\n",
|
540 | 547 | "):\n",
|
541 |
| - " print(f\"* [SIM={res_score:3f}] {res_doc.page_content} [{res_doc.metadata}]\")" |
| 548 | + " print(f\"* [SIM={res_score:.3f}] {res_doc.page_content} [{res_doc.metadata}]\")" |
542 | 549 | ]
|
543 | 550 | },
|
544 | 551 | {
|
545 | 552 | "cell_type": "markdown",
|
546 | 553 | "id": "a8280918-25ea-40f4-82c1-78d0b9b27278",
|
547 | 554 | "metadata": {},
|
548 | 555 | "source": [
|
549 |
| - "### Delete the **whole** stored data\n", |
| 556 | + "#### Delete the **whole** stored data\n", |
550 | 557 | "\n",
|
551 | 558 | "> _Warning: use with caution. Data loss!_"
|
552 | 559 | ]
|
|
558 | 565 | "metadata": {},
|
559 | 566 | "outputs": [],
|
560 | 567 | "source": [
|
561 |
| - "vstore.clear()" |
| 568 | + "vector_store.clear()" |
562 | 569 | ]
|
563 | 570 | },
|
564 | 571 | {
|
|
574 | 581 | "id": "2c08d41e-65f8-4096-8b9d-9f12ed91c485",
|
575 | 582 | "metadata": {},
|
576 | 583 | "source": [
|
577 |
| - "Let us completely delete the collection, thereby freeing the associated resources on Astra DB:\n", |
| 584 | + "Completely delete the collection, thereby freeing the associated resources on Astra DB:\n", |
578 | 585 | "\n",
|
579 | 586 | "> _Warning: use with caution. Data loss!_"
|
580 | 587 | ]
|
|
586 | 593 | "metadata": {},
|
587 | 594 | "outputs": [],
|
588 | 595 | "source": [
|
589 |
| - "vstore.delete_collection()" |
| 596 | + "vector_store.delete_collection()" |
| 597 | + ] |
| 598 | + }, |
| 599 | + { |
| 600 | + "cell_type": "markdown", |
| 601 | + "id": "41653b13-c903-4e7f-9806-afed3f4d726c", |
| 602 | + "metadata": {}, |
| 603 | + "source": [ |
| 604 | + "## Next steps" |
| 605 | + ] |
| 606 | + }, |
| 607 | + { |
| 608 | + "cell_type": "markdown", |
| 609 | + "id": "08d6c352-cdcc-46a5-9683-376f55d16b45", |
| 610 | + "metadata": {}, |
| 611 | + "source": [ |
| 612 | + "- [This quickstart on DataStax documentation](https://docs.datastax.com/en/astra-db-serverless/integrations/langchain.html)\n", |
| 613 | + "- [`AstraDBVectorStore` in LangChain docs](https://python.langchain.com/docs/integrations/providers/astradb/#vector-store)\n", |
| 614 | + "- [`AstraDBVectorStore`, API Reference](https://python.langchain.com/api_reference/astradb/vectorstores/langchain_astradb.vectorstores.AstraDBVectorStore.html#langchain_astradb.vectorstores.AstraDBVectorStore)" |
590 | 615 | ]
|
591 | 616 | }
|
592 | 617 | ],
|
|
606 | 631 | "name": "python",
|
607 | 632 | "nbconvert_exporter": "python",
|
608 | 633 | "pygments_lexer": "ipython3",
|
609 |
| - "version": "3.12.0" |
| 634 | + "version": "3.12.8" |
610 | 635 | }
|
611 | 636 | },
|
612 | 637 | "nbformat": 4,
|
|