147
147
Pinecone index to be returned by `build_index`.
148
148
"""
149
149
@kwdef mutable struct PineconeIndexer <: AbstractIndexBuilder
    # Default chunker picked up by `build_index` (`chunker = indexer.chunker`) and
    # handed to `get_chunks` to split the input files/documents into chunks.
    chunker::AbstractChunker = FileChunker()
    # Default embedder picked up by `build_index` and handed to `get_embeddings`.
    embedder::AbstractEmbedder = SimpleEmbedder()
    # Default tagger picked up by `build_index` and handed to `get_tags` / `build_tags`.
    tagger::AbstractTagger = NoTagger()
end
@@ -726,7 +726,8 @@ function build_index(
726
726
return index
727
727
end
728
728
729
- using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
729
+ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index, PineconeVector, upsert
730
+ using UUIDs: UUIDs, uuid4
730
731
# TODO : change docs
731
732
"""
732
733
build_index(
@@ -739,18 +740,60 @@ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
739
740
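Builds a `PineconeIndex` containing a Pinecone context (API key, index and namespace) together with the chunks, embeddings, tags, metadata and sources computed from `files_or_docs`; when `upsert = true`, each embedding is also upserted to the given Pinecone index and namespace.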
740
741
"""
741
742
function build_index (
742
- indexer:: PineconeIndexer ,
743
+ indexer:: PineconeIndexer , files_or_docs:: Vector{<:AbstractString} ;
744
+ metadata:: Vector{Dict{String, Any}} = Vector {Dict{String, Any}} (),
743
745
context:: Pinecone.PineconeContextv3 = Pinecone. init_v3 (" " ),
744
746
index:: Pinecone.PineconeIndexv3 = nothing ,
745
747
namespace:: AbstractString = " " ,
748
+ upsert:: Bool = false ,
746
749
verbose:: Integer = 1 ,
747
750
index_id = gensym (namespace),
751
+ chunker:: AbstractChunker = indexer. chunker,
752
+ chunker_kwargs:: NamedTuple = NamedTuple (),
753
+ embedder:: AbstractEmbedder = indexer. embedder,
754
+ embedder_kwargs:: NamedTuple = NamedTuple (),
755
+ tagger:: AbstractTagger = indexer. tagger,
756
+ tagger_kwargs:: NamedTuple = NamedTuple (),
757
+ api_kwargs:: NamedTuple = NamedTuple (),
748
758
cost_tracker = Threads. Atomic {Float64} (0.0 ))
749
759
@assert ! isempty (context. apikey) && ! isnothing (index) " Pinecone context and index not set"
750
760
751
- # TODO : add chunking, embedding, tags?
761
+ # # Split into chunks
762
+ chunks, sources = get_chunks (chunker, files_or_docs;
763
+ chunker_kwargs... )
764
+ # # Get metadata for each chunk
765
+ if isempty (metadata)
766
+ metadata = [Dict {String, Any} () for _ in sources]
767
+ else
768
+ metadata = [metadata[findfirst (f -> f == source, files_or_docs)] for source in sources]
769
+ [metadata[idx][" content" ] = chunk for (idx, chunk) in enumerate (chunks)]
770
+ end
771
+
772
+ # # Embed chunks
773
+ embeddings = get_embeddings (embedder, chunks;
774
+ verbose = (verbose > 1 ),
775
+ cost_tracker,
776
+ api_kwargs, embedder_kwargs... )
777
+
778
+ # # Extract tags
779
+ tags_extracted = get_tags (tagger, chunks;
780
+ verbose = (verbose > 1 ),
781
+ cost_tracker,
782
+ api_kwargs, tagger_kwargs... )
783
+ # Build the sparse matrix and the vocabulary
784
+ tags, tags_vocab = build_tags (tagger, tags_extracted)
785
+
786
+ # Upsert to Pinecone
787
+ if upsert
788
+ embeddings_arr = [embeddings[:,i] for i in axes (embeddings,2 )]
789
+ for (idx, emb) in enumerate (embeddings_arr)
790
+ pinevector = Pinecone. PineconeVector (string (UUIDs. uuid4 ()), emb, metadata[idx])
791
+ Pinecone. upsert (context, index, [pinevector], namespace)
792
+ @info " Upsert #$idx complete"
793
+ end
794
+ end
752
795
753
- index = PineconeIndex (; id = index_id, context, index, namespace)
796
+ index = PineconeIndex (; id = index_id, context, index, namespace, chunks, embeddings, tags, tags_vocab, metadata, sources )
754
797
755
798
(verbose > 0 ) && @info " Index built! (cost: \$ $(round (cost_tracker[], digits= 3 )) )"
756
799
0 commit comments