147
147
Pinecone index to be returned by `build_index`.
148
148
"""
149
149
@kwdef mutable struct PineconeIndexer <: AbstractIndexBuilder
# Holds the default chunker, embedder and tagger; `build_index` uses these fields as the
# defaults of its `chunker`, `embedder` and `tagger` keyword arguments.
150
# NOTE(review): the next two entries are the two sides of a diff hunk for the same field
# (`-` = previous default, `+` = new default). The chunker is handed to `get_chunks`.
- chunker:: AbstractChunker = TextChunker ()
150
+ chunker:: AbstractChunker = FileChunker ()
151
151
# Handed to `get_embeddings` together with the chunks.
embedder:: AbstractEmbedder = SimpleEmbedder ()
152
152
# Handed to `get_tags` together with the chunks.
tagger:: AbstractTagger = NoTagger ()
153
153
end
@@ -720,7 +720,8 @@ function build_index(
720
720
return index
721
721
end
722
722
723
- using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
723
+ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index, PineconeVector, upsert
724
+ using UUIDs: UUIDs, uuid4
724
725
# TODO : change docs
725
726
"""
726
727
build_index(
@@ -733,18 +734,60 @@ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
733
734
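Builds a `PineconeIndex` from `files_or_docs`: the inputs are split into chunks, embedded and tagged, and (when `upsert = true`) the embeddings are upserted to Pinecone together with their metadata. The resulting index also holds the Pinecone context (API key), index and namespace.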
734
735
"""
735
736
function build_index (
736
- indexer:: PineconeIndexer ,
737
+ indexer:: PineconeIndexer , files_or_docs:: Vector{<:AbstractString} ;
738
+ metadata:: Vector{Dict{String, Any}} = Vector {Dict{String, Any}} (),
737
739
context:: Pinecone.PineconeContextv3 = Pinecone. init_v3 (" " ),
738
740
index:: Pinecone.PineconeIndexv3 = nothing ,
739
741
namespace:: AbstractString = " " ,
742
+ upsert:: Bool = false ,
740
743
verbose:: Integer = 1 ,
741
744
index_id = gensym (namespace),
745
+ chunker:: AbstractChunker = indexer. chunker,
746
+ chunker_kwargs:: NamedTuple = NamedTuple (),
747
+ embedder:: AbstractEmbedder = indexer. embedder,
748
+ embedder_kwargs:: NamedTuple = NamedTuple (),
749
+ tagger:: AbstractTagger = indexer. tagger,
750
+ tagger_kwargs:: NamedTuple = NamedTuple (),
751
+ api_kwargs:: NamedTuple = NamedTuple (),
742
752
cost_tracker = Threads. Atomic {Float64} (0.0 ))
743
753
@assert ! isempty (context. apikey) && ! isnothing (index) " Pinecone context and index not set"
744
754
745
- # TODO : add chunking, embedding, tags?
755
+ # # Split into chunks
756
+ chunks, sources = get_chunks (chunker, files_or_docs;
757
+ chunker_kwargs... )
758
+ # # Get metadata for each chunk
759
+ if isempty (metadata)
760
+ metadata = [Dict {String, Any} () for _ in sources]
761
+ else
762
+ metadata = [metadata[findfirst (f -> f == source, files_or_docs)] for source in sources]
763
+ [metadata[idx][" content" ] = chunk for (idx, chunk) in enumerate (chunks)]
764
+ end
765
+
766
+ # # Embed chunks
767
+ embeddings = get_embeddings (embedder, chunks;
768
+ verbose = (verbose > 1 ),
769
+ cost_tracker,
770
+ api_kwargs, embedder_kwargs... )
771
+
772
+ # # Extract tags
773
+ tags_extracted = get_tags (tagger, chunks;
774
+ verbose = (verbose > 1 ),
775
+ cost_tracker,
776
+ api_kwargs, tagger_kwargs... )
777
+ # Build the sparse matrix and the vocabulary
778
+ tags, tags_vocab = build_tags (tagger, tags_extracted)
779
+
780
+ # Upsert to Pinecone
781
+ if upsert
782
+ embeddings_arr = [embeddings[:,i] for i in axes (embeddings,2 )]
783
+ for (idx, emb) in enumerate (embeddings_arr)
784
+ pinevector = Pinecone. PineconeVector (string (UUIDs. uuid4 ()), emb, metadata[idx])
785
+ Pinecone. upsert (context, index, [pinevector], namespace)
786
+ @info " Upsert #$idx complete"
787
+ end
788
+ end
746
789
747
- index = PineconeIndex (; id = index_id, context, index, namespace)
790
+ index = PineconeIndex (; id = index_id, context, index, namespace, chunks, embeddings, tags, tags_vocab, metadata, sources )
748
791
749
792
(verbose > 0 ) && @info " Index built! (cost: \$ $(round (cost_tracker[], digits= 3 )) )"
750
793
0 commit comments