Skip to content

Commit fc51ba2

Browse files
authored
Merge pull request #119 from pinecone-io/batch-upsert
Batch upsert
2 parents 2588016 + f82f5e8 commit fc51ba2

File tree

5 files changed

+339
-11
lines changed

5 files changed

+339
-11
lines changed

pinecone/core/grpc/index_grpc.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import grpc
1111
from google.protobuf import json_format
1212
from grpc._channel import _InactiveRpcError, _MultiThreadedRendezvous
13+
from tqdm import tqdm
14+
1315
from pinecone import FetchResponse, QueryResponse, ScoredVector, SingleQueryResults, DescribeIndexStatsResponse
1416
from pinecone.config import Config
1517
from pinecone.core.client.model.namespace_summary import NamespaceSummary
@@ -256,6 +258,8 @@ def upsert(self,
256258
vectors: Union[List[GRPCVector], List[Tuple]],
257259
async_req: bool = False,
258260
namespace: Optional[str] = None,
261+
batch_size: Optional[int] = None,
262+
show_progress: bool = True,
259263
**kwargs) -> Union[UpsertResponse, PineconeGrpcFuture]:
260264
"""
261265
The upsert operation writes vectors into a namespace.
@@ -282,13 +286,21 @@ def upsert(self,
282286
283287
Note: the dimension of each vector must match the dimension of the index.
284288
async_req (bool): If True, the upsert operation will be performed asynchronously.
289+
Cannot be used with batch_size.
285290
Defaults to False. See: https://docs.pinecone.io/docs/performance-tuning [optional]
286291
namespace (str): The namespace to write to. If not specified, the default namespace is used. [optional]
292+
batch_size (int): The number of vectors to upsert in each batch.
293+
Cannot be used with async_req=True.
294+
If not specified, all vectors will be upserted in a single batch. [optional]
295+
show_progress (bool): Whether to show a progress bar using tqdm.
296+
Applied only if batch_size is provided. Default is True.
287297
288298
Returns: UpsertResponse, contains the number of vectors upserted
289299
"""
290-
291-
args_dict = self._parse_non_empty_args([('namespace', namespace)])
300+
if async_req and batch_size is not None:
301+
raise ValueError('async_req is not supported when batch_size is provided. '
302+
'To upsert in parallel, please follow: '
303+
'https://docs.pinecone.io/docs/performance-tuning')
292304

293305
def _vector_transform(item):
294306
if isinstance(item, GRPCVector):
@@ -300,12 +312,37 @@ def _vector_transform(item):
300312

301313
timeout = kwargs.pop('timeout', None)
302314

303-
request = UpsertRequest(vectors=list(map(_vector_transform, vectors)), **args_dict, **kwargs)
315+
vectors = list(map(_vector_transform, vectors))
304316
if async_req:
317+
args_dict = self._parse_non_empty_args([('namespace', namespace)])
318+
request = UpsertRequest(vectors=vectors, **args_dict, **kwargs)
305319
future = self._wrap_grpc_call(self.stub.Upsert.future, request, timeout=timeout)
306320
return PineconeGrpcFuture(future)
307-
else:
308-
return self._wrap_grpc_call(self.stub.Upsert, request, timeout=timeout)
321+
322+
if batch_size is None:
323+
return self._upsert_batch(vectors, namespace, timeout=timeout, **kwargs)
324+
325+
if not isinstance(batch_size, int) or batch_size <= 0:
326+
raise ValueError('batch_size must be a positive integer')
327+
328+
pbar = tqdm(total=len(vectors), disable=not show_progress, desc='Upserted vectors')
329+
total_upserted = 0
330+
for i in range(0, len(vectors), batch_size):
331+
batch_result = self._upsert_batch(vectors[i:i + batch_size], namespace, timeout=timeout, **kwargs)
332+
pbar.update(batch_result.upserted_count)
333+
# we can't rely on pbar.n here because pbar is disabled when show_progress=False
334+
total_upserted += batch_result.upserted_count
335+
336+
return UpsertResponse(upserted_count=total_upserted)
337+
338+
def _upsert_batch(self,
339+
vectors: List[GRPCVector],
340+
namespace: Optional[str],
341+
timeout: Optional[float],
342+
**kwargs) -> UpsertResponse:
343+
args_dict = self._parse_non_empty_args([('namespace', namespace)])
344+
request = UpsertRequest(vectors=vectors, **args_dict)
345+
return self._wrap_grpc_call(self.stub.Upsert, request, timeout=timeout, **kwargs)
309346

310347
def delete(self,
311348
ids: Optional[List[str]] = None,

pinecone/index.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
33
#
4-
4+
from tqdm import tqdm
55
from collections.abc import Iterable
66
from typing import Union, List, Tuple, Optional, Dict, Any
77

@@ -64,6 +64,8 @@ def __init__(self, index_name: str, pool_threads=1):
6464
def upsert(self,
6565
vectors: Union[List[Vector], List[Tuple]],
6666
namespace: Optional[str] = None,
67+
batch_size: Optional[int] = None,
68+
show_progress: bool = True,
6769
**kwargs) -> UpsertResponse:
6870
"""
6971
The upsert operation writes vectors into a namespace.
@@ -95,16 +97,47 @@ def upsert(self,
9597
Note: the dimension of each vector must match the dimension of the index.
9698
9799
namespace (str): The namespace to write to. If not specified, the default namespace is used. [optional]
98-
100+
batch_size (int): The number of vectors to upsert in each batch.
101+
If not specified, all vectors will be upserted in a single batch. [optional]
102+
show_progress (bool): Whether to show a progress bar using tqdm.
103+
Applied only if batch_size is provided. Default is True.
99104
Keyword Args:
100105
Supports OpenAPI client keyword arguments. See pinecone.core.client.models.UpsertRequest for more details.
101106
102107
Returns: UpsertResponse, includes the number of vectors upserted.
103108
"""
104109
_check_type = kwargs.pop('_check_type', False)
110+
111+
if kwargs.get('async_req', False) and batch_size is not None:
112+
raise ValueError('async_req is not supported when batch_size is provided.'
113+
'To upsert in parallel, please follow: '
114+
'https://docs.pinecone.io/docs/insert-data#sending-upserts-in-parallel')
115+
116+
if batch_size is None:
117+
return self._upsert_batch(vectors, namespace, _check_type, **kwargs)
118+
119+
if not isinstance(batch_size, int) or batch_size <= 0:
120+
raise ValueError('batch_size must be a positive integer')
121+
122+
pbar = tqdm(total=len(vectors), disable=not show_progress, desc='Upserted vectors')
123+
total_upserted = 0
124+
for i in range(0, len(vectors), batch_size):
125+
batch_result = self._upsert_batch(vectors[i:i + batch_size], namespace, _check_type, **kwargs)
126+
pbar.update(batch_result.upserted_count)
127+
# we can't rely on pbar.n here because pbar is disabled when show_progress=False
128+
total_upserted += batch_result.upserted_count
129+
130+
return UpsertResponse(upserted_count=total_upserted)
131+
132+
def _upsert_batch(self,
133+
vectors: List[Vector],
134+
namespace: Optional[str],
135+
_check_type: bool,
136+
**kwargs) -> UpsertResponse:
137+
105138
args_dict = self._parse_non_empty_args([('namespace', namespace)])
106139

107-
def _vector_transform(item):
140+
def _vector_transform(item: Union[Vector, Tuple]):
108141
if isinstance(item, Vector):
109142
return item
110143
if isinstance(item, tuple):

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ typing-extensions>=3.7.4
55
dnspython>=2.0.0
66
# openapi generated client:
77
python_dateutil >= 2.5.3
8-
urllib3 >= 1.21.1
8+
urllib3 >= 1.21.1
9+
tqdm >= 4.64.1

tests/unit/test_grpc_index.py

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import pytest
2+
13
import pinecone
24
from core.utils import dict_to_proto_struct
35
from pinecone import DescribeIndexStatsRequest
46
from pinecone.core.grpc.protos.vector_service_pb2 import Vector, DescribeIndexStatsRequest, UpdateRequest, \
5-
UpsertRequest, FetchRequest, QueryRequest, DeleteRequest, QueryVector
7+
UpsertRequest, FetchRequest, QueryRequest, DeleteRequest, QueryVector, UpsertResponse
68

79

810
class TestGrpcIndex:
@@ -75,6 +77,128 @@ def test_upsert_async_upsertInputVectorsAsync(self, mocker):
7577
namespace='ns'),
7678
timeout=None)
7779

80+
def test_upsert_vectorListIsMultiplyOfBatchSize_vectorsUpsertedInBatches(self, mocker):
81+
mocker.patch.object(self.index, '_wrap_grpc_call', autospec=True,
82+
side_effect=lambda stub, upsert_request, timeout: UpsertResponse(
83+
upserted_count=len(upsert_request.vectors)))
84+
85+
result = self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
86+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
87+
namespace='ns',
88+
batch_size=1,
89+
show_progress=False)
90+
self.index._wrap_grpc_call.assert_any_call(
91+
self.index.stub.Upsert,
92+
UpsertRequest(
93+
vectors=[
94+
Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
95+
namespace='ns'),
96+
timeout=None)
97+
98+
self.index._wrap_grpc_call.assert_any_call(
99+
self.index.stub.Upsert,
100+
UpsertRequest(
101+
vectors=[Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
102+
namespace='ns'),
103+
timeout=None)
104+
105+
assert result.upserted_count == 2
106+
107+
def test_upsert_vectorListNotMultiplyOfBatchSize_vectorsUpsertedInBatches(self, mocker):
108+
mocker.patch.object(self.index, '_wrap_grpc_call', autospec=True,
109+
side_effect=lambda stub, upsert_request, timeout: UpsertResponse(
110+
upserted_count=len(upsert_request.vectors)))
111+
112+
result = self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
113+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2)),
114+
Vector(id='vec3', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
115+
namespace='ns',
116+
batch_size=2)
117+
self.index._wrap_grpc_call.assert_any_call(
118+
self.index.stub.Upsert,
119+
UpsertRequest(
120+
vectors=[
121+
Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
122+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
123+
namespace='ns'),
124+
timeout=None)
125+
126+
self.index._wrap_grpc_call.assert_any_call(
127+
self.index.stub.Upsert,
128+
UpsertRequest(
129+
vectors=[Vector(id='vec3', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
130+
namespace='ns'),
131+
timeout=None)
132+
133+
assert result.upserted_count == 3
134+
135+
def test_upsert_vectorListSmallerThanBatchSize_vectorsUpsertedInBatches(self, mocker):
136+
mocker.patch.object(self.index, '_wrap_grpc_call', autospec=True,
137+
side_effect=lambda stub, upsert_request, timeout: UpsertResponse(
138+
upserted_count=len(upsert_request.vectors)))
139+
140+
result = self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
141+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
142+
namespace='ns',
143+
batch_size=5)
144+
145+
self.index._wrap_grpc_call.assert_called_once_with(
146+
self.index.stub.Upsert,
147+
UpsertRequest(
148+
vectors=[
149+
Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
150+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
151+
namespace='ns'),
152+
timeout=None)
153+
154+
assert result.upserted_count == 2
155+
156+
def test_upsert_tuplesList_vectorsUpsertedInBatches(self, mocker):
157+
mocker.patch.object(self.index, '_wrap_grpc_call', autospec=True,
158+
side_effect=lambda stub, upsert_request, timeout: UpsertResponse(
159+
upserted_count=len(upsert_request.vectors)))
160+
161+
result = self.index.upsert([('vec1', self.vals1, self.md1),
162+
('vec2', self.vals2, self.md2),
163+
('vec3', self.vals1, self.md1)],
164+
namespace='ns',
165+
batch_size=2)
166+
self.index._wrap_grpc_call.assert_any_call(
167+
self.index.stub.Upsert,
168+
UpsertRequest(
169+
vectors=[
170+
Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1)),
171+
Vector(id='vec2', values=self.vals2, metadata=dict_to_proto_struct(self.md2))],
172+
namespace='ns'),
173+
timeout=None)
174+
175+
self.index._wrap_grpc_call.assert_any_call(
176+
self.index.stub.Upsert,
177+
UpsertRequest(
178+
vectors=[Vector(id='vec3', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
179+
namespace='ns'),
180+
timeout=None)
181+
182+
assert result.upserted_count == 3
183+
184+
def test_upsert_batchSizeIsNotPositive_errorIsRaised(self):
185+
with pytest.raises(ValueError, match='batch_size must be a positive integer'):
186+
self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
187+
namespace='ns',
188+
batch_size=0)
189+
190+
with pytest.raises(ValueError, match='batch_size must be a positive integer'):
191+
self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
192+
namespace='ns',
193+
batch_size=-1)
194+
195+
def test_upsert_useBatchSizeAndAsyncReq_valueErrorRaised(self):
196+
with pytest.raises(ValueError, match='async_req is not supported when batch_size is provided.'):
197+
self.index.upsert([Vector(id='vec1', values=self.vals1, metadata=dict_to_proto_struct(self.md1))],
198+
namespace='ns',
199+
batch_size=2,
200+
async_req=True)
201+
78202
# endregion
79203

80204
# region: query tests

0 commit comments

Comments
 (0)