Skip to content

Commit 1c4e0f5

Browse files
committed
Add chunking, embedding and tagging in build_index
1 parent 87c56b3 commit 1c4e0f5

File tree

4 files changed

+70
-8
lines changed

4 files changed

+70
-8
lines changed

src/Experimental/RAGTools/generation.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,10 @@ const DEFAULT_RAG_CONFIG = RAGConfig()
760760
# Config-less entry point for document indices: forwards the call to
# `airag(DEFAULT_RAG_CONFIG, index; ...)`, passing `question` and any extra
# keyword arguments through unchanged.
function airag(index::AbstractDocumentIndex; question::AbstractString, kwargs...)
    default_cfg = DEFAULT_RAG_CONFIG
    return airag(default_cfg, index; question = question, kwargs...)
end
763+
# RAG configuration that `airag(index::AbstractManagedIndex; ...)` falls back to
# when the caller does not pass a config: Pinecone indexer + Pinecone retriever
# combined with `AdvancedGenerator()`.
const DEFAULT_RAG_CONFIG_PINECONE = RAGConfig(PineconeIndexer(), PineconeRetriever(), AdvancedGenerator())
764+
# Config-less entry point for managed indices: forwards the call to
# `airag(DEFAULT_RAG_CONFIG_PINECONE, index; ...)`, passing `question` and any
# extra keyword arguments through unchanged.
function airag(index::AbstractManagedIndex; question::AbstractString, kwargs...)
    pinecone_cfg = DEFAULT_RAG_CONFIG_PINECONE
    return airag(pinecone_cfg, index; question = question, kwargs...)
end
763767

764768
# Special method to pretty-print the airag results
765769
function PT.pprint(io::IO, airag_result::Tuple{PT.AIMessage, AbstractRAGResult},

src/Experimental/RAGTools/preparation.jl

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ end
147147
Pinecone index to be returned by `build_index`.
148148
"""
149149
@kwdef mutable struct PineconeIndexer <: AbstractIndexBuilder
# Each field below is the default for the same-named keyword argument of
# `build_index(indexer::PineconeIndexer, ...)` (`chunker = indexer.chunker`, etc.).
150-
chunker::AbstractChunker = TextChunker()
150+
# Passed to `get_chunks` to split the input files/documents into chunks and sources.
# Default changes from `TextChunker()` (removed line above) to `FileChunker()`.
chunker::AbstractChunker = FileChunker()
151151
# Passed to `get_embeddings` to embed the chunks.
embedder::AbstractEmbedder = SimpleEmbedder()
152152
# Passed to `get_tags` / `build_tags`; defaults to `NoTagger()`.
tagger::AbstractTagger = NoTagger()
153153
end
@@ -726,7 +726,8 @@ function build_index(
726726
return index
727727
end
728728

729-
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
729+
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index, PineconeVector, upsert
730+
using UUIDs: UUIDs, uuid4
730731
# TODO: change docs
731732
"""
732733
build_index(
@@ -739,18 +740,60 @@ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
739740
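Builds a `PineconeIndex` from `files_or_docs`: splits them into chunks, embeds and tags the chunks, optionally upserts the embeddings to Pinecone (`upsert = true`), and stores the results together with the Pinecone context (API key, index and namespace).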
740741
"""
741742
function build_index(
742-
indexer::PineconeIndexer,
743+
indexer::PineconeIndexer, files_or_docs::Vector{<:AbstractString};
744+
metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}(),
743745
context::Pinecone.PineconeContextv3 = Pinecone.init_v3(""),
744746
index::Pinecone.PineconeIndexv3 = nothing,
745747
namespace::AbstractString = "",
748+
upsert::Bool = false,
746749
verbose::Integer = 1,
747750
index_id = gensym(namespace),
751+
chunker::AbstractChunker = indexer.chunker,
752+
chunker_kwargs::NamedTuple = NamedTuple(),
753+
embedder::AbstractEmbedder = indexer.embedder,
754+
embedder_kwargs::NamedTuple = NamedTuple(),
755+
tagger::AbstractTagger = indexer.tagger,
756+
tagger_kwargs::NamedTuple = NamedTuple(),
757+
api_kwargs::NamedTuple = NamedTuple(),
748758
cost_tracker = Threads.Atomic{Float64}(0.0))
749759
@assert !isempty(context.apikey) && !isnothing(index) "Pinecone context and index not set"
750760

751-
# TODO: add chunking, embedding, tags?
761+
## Split into chunks
762+
chunks, sources = get_chunks(chunker, files_or_docs;
763+
chunker_kwargs...)
764+
## Get metadata for each chunk
765+
if isempty(metadata)
766+
metadata = [Dict{String, Any}() for _ in sources]
767+
else
768+
metadata = [metadata[findfirst(f -> f == source, files_or_docs)] for source in sources]
769+
[metadata[idx]["content"] = chunk for (idx, chunk) in enumerate(chunks)]
770+
end
771+
772+
## Embed chunks
773+
embeddings = get_embeddings(embedder, chunks;
774+
verbose = (verbose > 1),
775+
cost_tracker,
776+
api_kwargs, embedder_kwargs...)
777+
778+
## Extract tags
779+
tags_extracted = get_tags(tagger, chunks;
780+
verbose = (verbose > 1),
781+
cost_tracker,
782+
api_kwargs, tagger_kwargs...)
783+
# Build the sparse matrix and the vocabulary
784+
tags, tags_vocab = build_tags(tagger, tags_extracted)
785+
786+
# Upsert to Pinecone
787+
if upsert
788+
embeddings_arr = [embeddings[:,i] for i in axes(embeddings,2)]
789+
for (idx, emb) in enumerate(embeddings_arr)
790+
pinevector = Pinecone.PineconeVector(string(UUIDs.uuid4()), emb, metadata[idx])
791+
Pinecone.upsert(context, index, [pinevector], namespace)
792+
@info "Upsert #$idx complete"
793+
end
794+
end
752795

753-
index = PineconeIndex(; id = index_id, context, index, namespace)
796+
index = PineconeIndex(; id = index_id, context, index, namespace, chunks, embeddings, tags, tags_vocab, metadata, sources)
754797

755798
(verbose > 0) && @info "Index built! (cost: \$$(round(cost_tracker[], digits=3)))"
756799

src/Experimental/RAGTools/retrieval.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,6 @@ function find_closest(
256256
pinecone_results_json = JSON3.read(pinecone_results)
257257
matches = pinecone_results_json.matches
258258

259-
# println(matches[1])
260-
261259
# get the chunks / metadata / sources / scores
262260
positions = [1 for _ in matches] # TODO: change this
263261
scores = [m.score for m in matches]

src/Experimental/RAGTools/types.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,31 @@ const ChunkIndex = ChunkEmbeddingsIndex
139139
# Identifier of a managed index, read from its `id` field.
function indexid(index::AbstractManagedIndex)
    return index.id
end
140140

141141
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3
142-
@kwdef struct PineconeIndex <: AbstractManagedIndex
142+
"""
    PineconeIndex

Managed index backed by Pinecone. Stores the Pinecone context, index and namespace,
plus the (optional) locally held chunk data produced when the index is built.

# Fields
- `id::Symbol`: identifier of the index (what `indexid` returns).
- `context::Pinecone.PineconeContextv3`: Pinecone context.
- `index::Pinecone.PineconeIndexv3`: Pinecone index.
- `namespace::String`: Pinecone namespace.
- `chunks::Vector{T1}`: underlying document chunks / snippets; empty by default.
- `embeddings::T2`: embeddings used for semantic search, or `nothing`.
- `tags::T3`: tag matrix for exact search / filtering, or `nothing`. Expected to be
  some sparse structure; column oriented, i.e., each column is one item in
  `tags_vocab` and the rows are the chunks.
- `tags_vocab`: vocabulary matching the columns of `tags`, or `nothing`.
- `metadata::Vector{Dict{String, Any}}`: metadata for each chunk; empty by default.
- `sources`: source of each chunk, or `nothing`.
"""
@kwdef struct PineconeIndex{
    T1 <: Union{Nothing, AbstractString},
    T2 <: Union{Nothing, Matrix{<:Real}},
    T3 <: Union{Nothing, AbstractMatrix{<:Bool}}
} <: AbstractManagedIndex
    id::Symbol # namespace
    context::Pinecone.PineconeContextv3
    index::Pinecone.PineconeIndexv3
    namespace::String
    # underlying document chunks / snippets
    # NOTE: the default must be a vector. `nothing` can neither bind `T1` in the
    # generated outer constructor nor be converted to `Vector{T1}`, so keyword
    # construction without `chunks` would throw a `MethodError`.
    chunks::Vector{T1} = String[]
    # for semantic search
    embeddings::T2 = nothing
    # for exact search, filtering, etc.
    # expected to be some sparse structure, eg, sparse matrix or nothing
    # column oriented, ie, each column is one item in `tags_vocab` and rows are the chunks
    tags::T3 = nothing
    tags_vocab::Union{Nothing, Vector{<:AbstractString}} = nothing
    # metadata for each chunk
    metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}()
    sources::Union{Nothing, Vector{<:AbstractString}} = nothing
end
148164
# Trait: a `PineconeIndex` reports `false` for keywords.
function HasKeywords(::PineconeIndex)
    return false
end
149165
# Trait: a `PineconeIndex` reports `true` for embeddings.
function HasEmbeddings(::PineconeIndex)
    return true
end
166+
# Accessor for the `embeddings` field of a `PineconeIndex`.
function embeddings(index::PineconeIndex)
    return index.embeddings
end
150167

151168
abstract type AbstractDocumentTermMatrix end
152169

0 commit comments

Comments
 (0)