From 2725fc4abc9caa9af0d1c7d1d97291d2a81e9497 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Wed, 26 Mar 2025 14:20:56 -0400 Subject: [PATCH 1/4] Change wording --- docs/langchain-retrieval-agent.ipynb | 2 +- docs/langchain-retrieval-augmentation.ipynb | 2 +- docs/semantic-search.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/langchain-retrieval-agent.ipynb b/docs/langchain-retrieval-agent.ipynb index b1b8dc0f..d984e02e 100644 --- a/docs/langchain-retrieval-agent.ipynb +++ b/docs/langchain-retrieval-agent.ipynb @@ -421,7 +421,7 @@ "\n", "- `name` can be anything we like. The name is used as an identifier for the index when performing other operations such as `describe_index`, `delete_index`, and so on. \n", "- `metric` specifies the similarity metric that will be used later when you make queries to the index.\n", - "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model. In this quick start, we are using made-up data so a small value is simplest.\n", + "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model.\n", "- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/troubleshooting/available-cloud-regions).\n", "\n", "There are more configurations available, but this minimal set will get us started." diff --git a/docs/langchain-retrieval-augmentation.ipynb b/docs/langchain-retrieval-augmentation.ipynb index be3f8858..b5bf49e0 100644 --- a/docs/langchain-retrieval-augmentation.ipynb +++ b/docs/langchain-retrieval-augmentation.ipynb @@ -254,7 +254,7 @@ "\n", "- `name` can be anything we like. The name is used as an identifier for the index when performing other operations such as `describe_index`, `delete_index`, and so on. \n", "- `metric` specifies the similarity metric that will be used later when you make queries to the index.\n", - "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model. In this quick start, we are using made-up data so a small value is simplest.\n", + "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model.\n", "- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects).\n", "\n", "There are more configurations available, but this minimal set will get us started." diff --git a/docs/semantic-search.ipynb b/docs/semantic-search.ipynb index e15ae5e0..fde2b31c 100644 --- a/docs/semantic-search.ipynb +++ b/docs/semantic-search.ipynb @@ -326,7 +326,7 @@ "\n", "- `name` can be anything we like. The name is used as an identifier for the index when performing other operations such as `describe_index`, `delete_index`, and so on. \n", "- `metric` specifies the similarity metric that will be used later when you make queries to the index.\n", - "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model. In this quick start, we are using made-up data so a small value is simplest.\n", + "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model.\n", "- `spec` holds a specification which tells Pinecone how you would like to deploy our index. 
You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects).\n", "\n", "There are more configurations available, but this minimal set will get us started." From b7229cb6bbe5711573f1a504accf93719e2412d6 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Wed, 26 Mar 2025 14:22:25 -0400 Subject: [PATCH 2/4] Black formatting --- docs/langchain-retrieval-agent.ipynb | 60 ++++++++++----------- docs/langchain-retrieval-augmentation.ipynb | 47 +++++++--------- docs/semantic-search.ipynb | 39 +++++++------- 3 files changed, 69 insertions(+), 77 deletions(-) diff --git a/docs/langchain-retrieval-agent.ipynb b/docs/langchain-retrieval-agent.ipynb index d984e02e..11fe65b3 100644 --- a/docs/langchain-retrieval-agent.ipynb +++ b/docs/langchain-retrieval-agent.ipynb @@ -354,7 +354,7 @@ ], "source": [ "# we drop sparse_values as they are not needed for this example\n", - "dataset.documents.drop(['sparse_values', 'blob'], axis=1, inplace=True)\n", + "dataset.documents.drop([\"sparse_values\", \"blob\"], axis=1, inplace=True)\n", "\n", "dataset.head()" ] @@ -369,7 +369,7 @@ "\n", "print(\"Here are some example topics in our Knowledge Base:\\n\")\n", "for r in dataset.documents.iloc[:].to_dict(orient=\"records\"):\n", - " topics.add(r['metadata']['title'])\n", + " topics.add(r[\"metadata\"][\"title\"])\n", "\n", "for topic in sorted(topics)[50:75]:\n", " print(f\"- {topic}\")" @@ -396,6 +396,7 @@ "\n", "if not os.environ.get(\"PINECONE_API_KEY\"):\n", " from pinecone_notebooks.colab import Authenticate\n", + "\n", " Authenticate()" ] }, @@ -464,18 +465,15 @@ "source": [ "from pinecone import ServerlessSpec\n", "\n", - "index_name = 'langchain-retrieval-agent-fast'\n", + "index_name = \"langchain-retrieval-agent-fast\"\n", "\n", "if not pc.has_index(name=index_name):\n", " # Create a new index\n", " pc.create_index(\n", " name=index_name,\n", " dimension=1536, # dimensionality of text-embedding-ada-002\n", - " metric='dotproduct',\n", - " spec=ServerlessSpec(\n", - " cloud='aws',\n", - " region='us-east-1'\n", - " )\n", + " metric=\"dotproduct\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", " )\n", "\n", "pc.describe_index(name=index_name)" @@ -651,12 +649,9 @@ "source": [ "from langchain_openai import OpenAIEmbeddings\n", "\n", - "openai_api_key = os.environ.get('OPENAI_API_KEY') or 'OPENAI_API_KEY'\n", + "openai_api_key = os.environ.get(\"OPENAI_API_KEY\") or \"OPENAI_API_KEY\"\n", "\n", - "embed = OpenAIEmbeddings(\n", - " model='text-embedding-ada-002',\n", - " openai_api_key=openai_api_key\n", - ")" + "embed = OpenAIEmbeddings(model=\"text-embedding-ada-002\", openai_api_key=openai_api_key)" ] }, { @@ -670,9 +665,7 @@ "from langchain_pinecone import PineconeVectorStore\n", "\n", "pinecone_vectorstore = PineconeVectorStore(\n", - " index_name=index_name, \n", - " embedding=embed, \n", - " text_key=\"text\"\n", + " index_name=index_name, embedding=embed, text_key=\"text\"\n", ")" ] }, @@ -759,11 +752,12 @@ "source": [ "from pprint import pprint\n", "\n", - "query = \"When was the college of engineering in the University of Notre Dame established?\"\n", + "query = (\n", + " \"When was the college of engineering in the University of Notre Dame established?\"\n", + ")\n", "\n", "documents = pinecone_vectorstore.similarity_search(\n", - " query=query,\n", - " k=3 # return 3 most relevant docs\n", + " query=query, k=3 # return 3 most relevant docs\n", ")\n", "\n", "for doc in documents:\n", @@ -815,9 +809,7 @@ "\n", "# Chat completion LLM\n", 
"llm = ChatOpenAI(\n", - " openai_api_key=openai_api_key,\n", - " model_name='gpt-3.5-turbo',\n", - " temperature=0.0\n", + " openai_api_key=openai_api_key, model_name=\"gpt-3.5-turbo\", temperature=0.0\n", ")" ] }, @@ -839,7 +831,7 @@ "from langchain_core.runnables import RunnablePassthrough\n", "\n", "# Based on the RAG template from https://smith.langchain.com/hub/rlm/rag-prompt\n", - "template=(\n", + "template = (\n", " \"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\"\n", " \"Question: {question}\"\n", " \"Context: {context}\"\n", @@ -847,9 +839,11 @@ ")\n", "prompt = PromptTemplate(input_variables=[\"question\", \"context\"], template=template)\n", "\n", + "\n", "def format_docs(docs):\n", " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", "\n", + "\n", "# Retrieval Question-Answer chain\n", "qa_chain = (\n", " {\n", @@ -896,7 +890,9 @@ } ], "source": [ - "qa_chain.invoke(\"When was the college of engineering in the University of Notre Dame established?\")" + "qa_chain.invoke(\n", + " \"When was the college of engineering in the University of Notre Dame established?\"\n", + ")" ] }, { @@ -920,11 +916,11 @@ "outputs": [], "source": [ "knowledge_base_tool = qa_chain.as_tool(\n", - " name='knowledge-base',\n", - " description=(\n", - " 'use this tool when answering general knowledge queries to get '\n", - " 'more information about the topic'\n", - " )\n", + " name=\"knowledge-base\",\n", + " description=(\n", + " \"use this tool when answering general knowledge queries to get \"\n", + " \"more information about the topic\"\n", + " ),\n", ")" ] }, @@ -966,9 +962,11 @@ "from langgraph.graph import StateGraph\n", "from langgraph.graph.message import add_messages\n", "\n", + "\n", "class State(TypedDict):\n", " messages: Annotated[list, add_messages]\n", "\n", + "\n", "graph_builder = StateGraph(State)" ] }, @@ -1001,9 +999,11 @@ "tools = [knowledge_base_tool]\n", "llm_with_tools = llm.bind_tools(tools)\n", "\n", + "\n", "def chatbot(state: State):\n", " return {\"messages\": [llm_with_tools.invoke(state[\"messages\"])]}\n", "\n", + "\n", "graph_builder.add_node(\"chatbot\", chatbot)\n", "\n", "tool_node = ToolNode(tools=tools)\n", @@ -1054,7 +1054,7 @@ "source": [ "def agent(user_message):\n", " config = {\"configurable\": {\"thread_id\": \"1\"}}\n", - " \n", + "\n", " # The config is the **second positional argument** to stream() or invoke()!\n", " events = graph.stream(\n", " {\"messages\": [{\"role\": \"user\", \"content\": user_message}]},\n", diff --git a/docs/langchain-retrieval-augmentation.ipynb b/docs/langchain-retrieval-augmentation.ipynb index b5bf49e0..75a1b783 100644 --- a/docs/langchain-retrieval-augmentation.ipynb +++ b/docs/langchain-retrieval-augmentation.ipynb @@ -180,11 +180,11 @@ "source": [ "from pinecone_datasets import load_dataset\n", "\n", - "dataset = load_dataset('wikipedia-simple-text-embedding-ada-002-50K')\n", + "dataset = load_dataset(\"wikipedia-simple-text-embedding-ada-002-50K\")\n", "\n", "# We drop sparse_values and blob keys as they are not needed for this example\n", - "dataset.documents.drop(['sparse_values'], axis=1, inplace=True)\n", - "dataset.documents.drop(['blob'], axis=1, inplace=True)\n", + "dataset.documents.drop([\"sparse_values\"], axis=1, inplace=True)\n", + "dataset.documents.drop([\"blob\"], axis=1, inplace=True)\n", "\n", 
"dataset.head()" ] @@ -220,6 +220,7 @@ "\n", "if not os.environ.get(\"PINECONE_API_KEY\"):\n", " from pinecone_notebooks.colab import Authenticate\n", + "\n", " Authenticate()" ] }, @@ -301,17 +302,14 @@ "source": [ "from pinecone import ServerlessSpec\n", "\n", - "index_name = 'langchain-retrieval-augmentation-fast'\n", + "index_name = \"langchain-retrieval-augmentation-fast\"\n", "\n", "if not pc.has_index(name=index_name):\n", " pc.create_index(\n", " name=index_name,\n", " dimension=1536, # dimensionality of text-embedding-ada-002\n", - " metric='dotproduct',\n", - " spec=ServerlessSpec(\n", - " cloud='aws',\n", - " region='us-east-1'\n", - " )\n", + " metric=\"dotproduct\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", " )\n", "\n", "pc.describe_index(name=index_name)" @@ -420,8 +418,10 @@ "\n", "batch_size = 100\n", "\n", - "for start in tqdm(range(0, len(dataset.documents), batch_size), \"Upserting records batch\"):\n", - " batch = dataset.documents.iloc[start:start + batch_size].to_dict(orient=\"records\")\n", + "for start in tqdm(\n", + " range(0, len(dataset.documents), batch_size), \"Upserting records batch\"\n", + "):\n", + " batch = dataset.documents.iloc[start : start + batch_size].to_dict(orient=\"records\")\n", " index.upsert(vectors=batch)" ] }, @@ -487,14 +487,11 @@ "from langchain_openai import OpenAIEmbeddings\n", "\n", "# Get openai api key from platform.openai.com\n", - "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or 'OPENAI_API_KEY'\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\") or \"OPENAI_API_KEY\"\n", "\n", - "model_name = 'text-embedding-ada-002'\n", + "model_name = \"text-embedding-ada-002\"\n", "\n", - "embed = OpenAIEmbeddings(\n", - " model=model_name,\n", - " openai_api_key=OPENAI_API_KEY\n", - ")" + "embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)" ] }, { @@ -518,9 +515,7 @@ "from langchain_pinecone import PineconeVectorStore\n", "\n", "pinecone_vectorstore = PineconeVectorStore(\n", - " index_name=index_name, \n", - " embedding=embed, \n", - " text_key=\"text\"\n", + " index_name=index_name, embedding=embed, text_key=\"text\"\n", ")" ] }, @@ -670,7 +665,7 @@ "\n", "documents = pinecone_vectorstore.similarity_search(\n", " query=\"Who was Benito Mussolini?\", # our search query\n", - " k=3 # return 3 most relevant docs\n", + " k=3, # return 3 most relevant docs\n", ")\n", "\n", "for doc in documents:\n", @@ -707,15 +702,11 @@ "\n", "# Chat Completion LLM\n", "llm = ChatOpenAI(\n", - " openai_api_key=OPENAI_API_KEY,\n", - " model_name='gpt-4.5-preview',\n", - " temperature=0.0\n", + " openai_api_key=OPENAI_API_KEY, model_name=\"gpt-4.5-preview\", temperature=0.0\n", ")\n", "\n", "qa = RetrievalQA.from_chain_type(\n", - " llm=llm,\n", - " chain_type=\"stuff\",\n", - " retriever=pinecone_vectorstore.as_retriever()\n", + " llm=llm, chain_type=\"stuff\", retriever=pinecone_vectorstore.as_retriever()\n", ")" ] }, @@ -771,7 +762,7 @@ " llm=llm,\n", " chain_type=\"stuff\",\n", " retriever=pinecone_vectorstore.as_retriever(),\n", - " return_source_documents=True\n", + " return_source_documents=True,\n", ")" ] }, diff --git a/docs/semantic-search.ipynb b/docs/semantic-search.ipynb index fde2b31c..c5953c91 100644 --- a/docs/semantic-search.ipynb +++ b/docs/semantic-search.ipynb @@ -173,14 +173,14 @@ "source": [ "from pinecone_datasets import load_dataset\n", "\n", - "dataset = load_dataset('quora_all-MiniLM-L6-bm25')\n", + "dataset = load_dataset(\"quora_all-MiniLM-L6-bm25\")\n", "\n", "# The metadata we need 
is actually stored in the \"blob\" column so let's rename it\n", - "dataset.documents.drop(['metadata'], axis=1, inplace=True)\n", - "dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)\n", + "dataset.documents.drop([\"metadata\"], axis=1, inplace=True)\n", + "dataset.documents.rename(columns={\"blob\": \"metadata\"}, inplace=True)\n", "\n", "# We don't need sparse_values for this demo either so let's drop those as well\n", - "dataset.documents.drop(['sparse_values'], axis=1, inplace=True)\n", + "dataset.documents.drop([\"sparse_values\"], axis=1, inplace=True)\n", "\n", "# To speed things up in this demo, we will use 80K rows of the dataset between rows 240K -> 320K\n", "dataset.documents.drop(dataset.documents.index[320_000:], inplace=True)\n", @@ -233,7 +233,7 @@ ], "source": [ "row1 = dataset.documents.iloc[0:1].to_dict(orient=\"records\")[0]\n", - "dimension = len(row1['values'])\n", + "dimension = len(row1[\"values\"])\n", "print(f\"These embeddings have dimension {dimension}\")" ] }, @@ -264,7 +264,7 @@ "source": [ "print(\"Here are some example questions in the data set:\\n\")\n", "for r in dataset.documents.iloc[0:10].to_dict(orient=\"records\"):\n", - " print(\" -\" + r['metadata']['text'])" + " print(\" -\" + r[\"metadata\"][\"text\"])" ] }, { @@ -290,6 +290,7 @@ "\n", "if not os.environ.get(\"PINECONE_API_KEY\"):\n", " from pinecone_notebooks.colab import Authenticate\n", + "\n", " Authenticate()" ] }, @@ -362,19 +363,16 @@ "source": [ "from pinecone import ServerlessSpec\n", "\n", - "index_name = 'semantic-search-fast'\n", + "index_name = \"semantic-search-fast\"\n", "\n", "# Check if index already exists (it shouldn't if this is first time running the demo)\n", "if not pc.has_index(name=index_name):\n", " # If does not exist, create index\n", " pc.create_index(\n", " name=index_name,\n", - " dimension=384, # dimensionality of MiniLM\n", - " metric='dotproduct',\n", - " spec = ServerlessSpec(\n", - " cloud='aws', \n", - " region='us-east-1'\n", - " )\n", + " dimension=384, # dimensionality of MiniLM\n", + " metric=\"dotproduct\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", " )\n", "\n", "# Initialize index client\n", @@ -429,8 +427,10 @@ "\n", "batch_size = 100\n", "\n", - "for start in tqdm(range(0, len(dataset.documents), batch_size), \"Upserting records batch\"):\n", - " batch = dataset.documents.iloc[start:start + batch_size].to_dict(orient=\"records\")\n", + "for start in tqdm(\n", + " range(0, len(dataset.documents), batch_size), \"Upserting records batch\"\n", + "):\n", + " batch = dataset.documents.iloc[start : start + batch_size].to_dict(orient=\"records\")\n", " index.upsert(vectors=batch)" ] }, @@ -639,9 +639,9 @@ "from sentence_transformers import SentenceTransformer\n", "import torch\n", "\n", - "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", - "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)\n", + "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\", device=device)\n", "model" ] }, @@ -671,7 +671,7 @@ " xq = model.encode(question).tolist()\n", "\n", " # Now query Pinecone to find similar questions\n", - " return index.query(vector=xq, top_k=5, include_metadata=True)\n" + " return index.query(vector=xq, top_k=5, include_metadata=True)" ] }, { @@ -761,9 +761,10 @@ ], "source": [ "def print_query_results(results):\n", - " for result in results['matches']:\n", + " for result in 
results[\"matches\"]:\n",
    "        print(f\"{round(result['score'], 2)}: {result['metadata']['text']}\")\n",
    "\n",
    "\n",
    "print_query_results(xc)"
   ]
  },

From 3e0af2be05d4dfff6c578c61c6b3ae74262a9f74 Mon Sep 17 00:00:00 2001
From: Jen Hamon
Date: Wed, 26 Mar 2025 14:25:52 -0400
Subject: [PATCH 3/4] Update links

---
 docs/semantic-search.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/semantic-search.ipynb b/docs/semantic-search.ipynb
index c5953c91..efae75fc 100644
--- a/docs/semantic-search.ipynb
+++ b/docs/semantic-search.ipynb
@@ -328,7 +328,7 @@
 "- `name` can be anything we like. The name is used as an identifier for the index when performing other operations such as `describe_index`, `delete_index`, and so on. \n",
 "- `metric` specifies the similarity metric that will be used later when you make queries to the index.\n",
 "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model.\n",
-"- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects).\n",
+"- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/guides/projects/understanding-projects).\n",
 "\n",
 "There are more configurations available, but this minimal set will get us started."
   ]
  },

From bb441c6c643ce1ad115bd0a64d3a894d9e3aef8a Mon Sep 17 00:00:00 2001
From: Jen Hamon
Date: Wed, 26 Mar 2025 14:42:52 -0400
Subject: [PATCH 4/4] Update links

---
 docs/langchain-retrieval-augmentation.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/langchain-retrieval-augmentation.ipynb b/docs/langchain-retrieval-augmentation.ipynb
index 75a1b783..5f52e306 100644
--- a/docs/langchain-retrieval-augmentation.ipynb
+++ b/docs/langchain-retrieval-augmentation.ipynb
@@ -17,7 +17,7 @@
 "id": "dQRA1HWOJYbU"
 },
 "source": [
-"#### [LangChain Handbook](https://pinecone.io/learn/langchain)\n",
+"#### [LangChain Handbook](https://www.pinecone.io/learn/langchain)\n",
 "\n",
 "# Retrieval Augmentation\n",
 "\n",
@@ -256,7 +256,7 @@
 "- `name` can be anything we like. The name is used as an identifier for the index when performing other operations such as `describe_index`, `delete_index`, and so on. \n",
 "- `metric` specifies the similarity metric that will be used later when you make queries to the index.\n",
 "- `dimension` should correspond to the dimension of the dense vectors produced by your embedding model.\n",
-"- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects).\n",
+"- `spec` holds a specification which tells Pinecone how you would like to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/guides/projects/understanding-projects).\n",
 "\n",
 "There are more configurations available, but this minimal set will get us started."
   ]
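
Taken together, these four patches leave all three notebooks sharing the same serverless index bootstrap: check `has_index`, create the index with a `ServerlessSpec`, then upsert records in batches of 100. For reviewers, here is that flow as a standalone sketch, using the values from semantic-search.ipynb after the Black pass. The `Pinecone(...)` constructor and the `pc.Index(...)` initialization are assumptions filled in from memory of the surrounding notebook cells (neither line appears in the hunks above); every other name, value, and call is taken directly from the diffs.

```python
# Minimal sketch of the create-then-upsert flow shared by the notebooks.
# Assumed: pinecone Python client v3+ and the pinecone_datasets package;
# the Pinecone(...) and pc.Index(...) lines are not shown in the diffs above.
import os

from pinecone import Pinecone, ServerlessSpec
from pinecone_datasets import load_dataset
from tqdm.auto import tqdm

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # assumed constructor

# Load the pre-embedded Quora dataset used in semantic-search.ipynb
dataset = load_dataset("quora_all-MiniLM-L6-bm25")

# The metadata we need is stored in the "blob" column, so rename it,
# and drop sparse_values, which this demo does not use
dataset.documents.drop(["metadata"], axis=1, inplace=True)
dataset.documents.rename(columns={"blob": "metadata"}, inplace=True)
dataset.documents.drop(["sparse_values"], axis=1, inplace=True)

index_name = "semantic-search-fast"

# Check if the index already exists; if not, create it
if not pc.has_index(name=index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensionality of MiniLM (1536 for text-embedding-ada-002)
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)  # assumed: initialize the index client by name

# Upsert records in batches of 100; each record dict has id, values, metadata
batch_size = 100
for start in tqdm(
    range(0, len(dataset.documents), batch_size), "Upserting records batch"
):
    batch = dataset.documents.iloc[start : start + batch_size].to_dict(
        orient="records"
    )
    index.upsert(vectors=batch)
```

One note on patch 2: the spaced slice `start : start + batch_size` is Black's convention whenever a slice bound is a compound expression, which is why the upsert loops are rewritten the same way in all three notebooks.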