Skip to content

Commit 29b9335

Browse files
committed
Add chunking, embedding and tagging in build_index
1 parent 4d4b05b commit 29b9335

File tree

4 files changed

+70
-8
lines changed

4 files changed

+70
-8
lines changed

src/Experimental/RAGTools/generation.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,10 @@ const DEFAULT_RAG_CONFIG = RAGConfig()
760760
# Convenience method: run `airag` on a document index with the package-wide
# `DEFAULT_RAG_CONFIG`, forwarding `question` and all other keyword arguments.
airag(index::AbstractDocumentIndex; question::AbstractString, kwargs...) =
    airag(DEFAULT_RAG_CONFIG, index; question, kwargs...)
763+
# Default RAG pipeline for managed (Pinecone-backed) indices: Pinecone indexer and
# retriever combined with the `AdvancedGenerator`.
const DEFAULT_RAG_CONFIG_PINECONE = RAGConfig(
    PineconeIndexer(), PineconeRetriever(), AdvancedGenerator())

# Convenience method: run `airag` on a managed index with
# `DEFAULT_RAG_CONFIG_PINECONE`, forwarding `question` and all other keyword arguments.
airag(index::AbstractManagedIndex; question::AbstractString, kwargs...) =
    airag(DEFAULT_RAG_CONFIG_PINECONE, index; question, kwargs...)
763767

764768
# Special method to pretty-print the airag results
765769
function PT.pprint(io::IO, airag_result::Tuple{PT.AIMessage, AbstractRAGResult},

src/Experimental/RAGTools/preparation.jl

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ end
147147
Pinecone index to be returned by `build_index`.
148148
"""
149149
Base.@kwdef mutable struct PineconeIndexer <: AbstractIndexBuilder
    # Default chunker `build_index` uses to split the inputs into chunks
    chunker::AbstractChunker = FileChunker()
    # Default embedder `build_index` uses to embed the chunks
    embedder::AbstractEmbedder = SimpleEmbedder()
    # Default tagger `build_index` uses to extract tags from the chunks
    tagger::AbstractTagger = NoTagger()
end
@@ -720,7 +720,8 @@ function build_index(
720720
return index
721721
end
722722

723-
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
723+
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index, PineconeVector, upsert
724+
using UUIDs: UUIDs, uuid4
724725
# TODO: change docs
725726
"""
726727
build_index(
@@ -733,18 +734,60 @@ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index
733734
Builds a `PineconeIndex` containing a Pinecone context (API key, index and namespace).
734735
"""
735736
function build_index(
        indexer::PineconeIndexer, files_or_docs::Vector{<:AbstractString};
        metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}(),
        context::Pinecone.PineconeContextv3 = Pinecone.init_v3(""),
        index::Union{Nothing, Pinecone.PineconeIndexv3} = nothing,
        namespace::AbstractString = "",
        upsert::Bool = false,
        verbose::Integer = 1,
        index_id = gensym(namespace),
        chunker::AbstractChunker = indexer.chunker,
        chunker_kwargs::NamedTuple = NamedTuple(),
        embedder::AbstractEmbedder = indexer.embedder,
        embedder_kwargs::NamedTuple = NamedTuple(),
        tagger::AbstractTagger = indexer.tagger,
        tagger_kwargs::NamedTuple = NamedTuple(),
        api_kwargs::NamedTuple = NamedTuple(),
        cost_tracker = Threads.Atomic{Float64}(0.0))
    # Chunks `files_or_docs`, embeds and tags the chunks, optionally upserts one
    # vector per chunk to Pinecone, and returns a `PineconeIndex`.
    #
    # Keyword arguments
    # - `metadata`: optional, one `Dict` per entry of `files_or_docs` (matched by
    #   position). Each chunk receives its own copy of its source's entry with the
    #   chunk text stored under "content"; the caller's dictionaries are not mutated.
    # - `context`, `index`: Pinecone handles. `context.apikey` must be non-empty and
    #   `index` must not be `nothing`, otherwise the assertion below fails.
    # - `namespace`: Pinecone namespace passed to the upsert call and stored in the index.
    # - `upsert`: when `true`, every chunk embedding is sent to Pinecone.
    # - `verbose`: `> 0` prints progress messages, `> 1` also enables verbose output
    #   in the embedder and tagger.
    # - `chunker`, `embedder`, `tagger` (+ `*_kwargs`, `api_kwargs`): pipeline steps,
    #   defaulting to the ones stored in `indexer`.
    # - `cost_tracker`: atomic accumulator handed to the embedder and tagger.
    #
    # `index` accepts `nothing` in its annotation so that the `nothing` default reaches
    # this assertion instead of failing the keyword type check first.
    @assert !isempty(context.apikey) && !isnothing(index) "Pinecone context and index not set"

    ## Split into chunks
    chunks, sources = get_chunks(chunker, files_or_docs; chunker_kwargs...)

    ## Get metadata for each chunk
    chunk_metadata = Dict{String, Any}[]
    sizehint!(chunk_metadata, length(sources))
    if isempty(metadata)
        # NOTE(review): in this branch the chunk text is NOT stored under "content",
        # unlike the branch below -- confirm that this asymmetry is intended.
        for _ in sources
            push!(chunk_metadata, Dict{String, Any}())
        end
    else
        for (idx, source) in enumerate(sources)
            pos = findfirst(==(source), files_or_docs)
            # Copy so that chunks sharing one source do not alias (and overwrite) the
            # same dictionary; fall back to an empty entry for an unmatched source.
            entry = isnothing(pos) ? Dict{String, Any}() : copy(metadata[pos])
            entry["content"] = chunks[idx]
            push!(chunk_metadata, entry)
        end
    end

    ## Embed chunks
    embeddings = get_embeddings(embedder, chunks;
        verbose = (verbose > 1),
        cost_tracker,
        api_kwargs, embedder_kwargs...)

    ## Extract tags
    tags_extracted = get_tags(tagger, chunks;
        verbose = (verbose > 1),
        cost_tracker,
        api_kwargs, tagger_kwargs...)
    # Build the sparse matrix and the vocabulary
    tags, tags_vocab = build_tags(tagger, tags_extracted)

    ## Upsert to Pinecone
    # The `upsert` keyword shadows the imported function of the same name, hence the
    # module-qualified `Pinecone.upsert` call.
    if upsert
        for (idx, col) in enumerate(axes(embeddings, 2))
            pinevector = Pinecone.PineconeVector(
                string(UUIDs.uuid4()), embeddings[:, col], chunk_metadata[idx])
            Pinecone.upsert(context, index, [pinevector], namespace)
            (verbose > 0) && @info "Upsert #$idx complete"
        end
    end

    # Separate local name: do not rebind the `index` keyword (the Pinecone handle).
    pinecone_index = PineconeIndex(; id = index_id, context, index, namespace, chunks,
        embeddings, tags, tags_vocab, metadata = chunk_metadata, sources)

    (verbose > 0) && @info "Index built! (cost: \$$(round(cost_tracker[], digits=3)))"

    return pinecone_index
end

src/Experimental/RAGTools/retrieval.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,6 @@ function find_closest(
256256
pinecone_results_json = JSON3.read(pinecone_results)
257257
matches = pinecone_results_json.matches
258258

259-
# println(matches[1])
260-
261259
# get the chunks / metadata / sources / scores
262260
positions = [1 for _ in matches] # TODO: change this
263261
scores = [m.score for m in matches]

src/Experimental/RAGTools/types.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,31 @@ const ChunkIndex = ChunkEmbeddingsIndex
139139
# Identifier of a managed index, read from its `id` field.
function indexid(index::AbstractManagedIndex)
    return index.id
end
140140

141141
using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3
142-
@kwdef struct PineconeIndex <: AbstractManagedIndex
142+
"""
    PineconeIndex

Managed index backed by Pinecone. Holds the Pinecone handles (`context`, `index`,
`namespace`) together with the locally computed chunk data.

# Fields
- `id::Symbol`: identifier of the index
- `context::Pinecone.PineconeContextv3`: Pinecone context
- `index::Pinecone.PineconeIndexv3`: Pinecone index handle
- `namespace::String`: Pinecone namespace
- `chunks::Vector{T1}`: underlying document chunks / snippets (defaults to empty)
- `embeddings`: embedding matrix used for semantic search, or `nothing`
- `tags`: tag matrix for exact search / filtering, or `nothing`
- `tags_vocab`: vocabulary matching the columns of `tags`, or `nothing`
- `metadata::Vector{Dict{String, Any}}`: metadata for each chunk
- `sources`: source of each chunk, or `nothing`
"""
@kwdef struct PineconeIndex{
    T1 <: Union{Nothing, AbstractString},
    T2 <: Union{Nothing, Matrix{<:Real}},
    T3 <: Union{Nothing, AbstractMatrix{<:Bool}}
} <: AbstractManagedIndex
    id::Symbol # namespace
    context::Pinecone.PineconeContextv3
    index::Pinecone.PineconeIndexv3
    namespace::String
    # underlying document chunks / snippets
    # (the default must be a vector: `nothing` cannot be stored in a `Vector{T1}` field,
    # so a `nothing` default made the keyword constructor throw when `chunks` was omitted)
    chunks::Vector{T1} = String[]
    # for semantic search
    embeddings::T2 = nothing
    # for exact search, filtering, etc.
    # expected to be some sparse structure, eg, sparse matrix or nothing
    # column oriented, ie, each column is one item in `tags_vocab` and rows are the chunks
    tags::T3 = nothing
    tags_vocab::Union{Nothing, Vector{<:AbstractString}} = nothing
    # metadata for each chunk
    metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}()
    sources::Union{Nothing, Vector{<:AbstractString}} = nothing
end
148164
# Trait: a `PineconeIndex` does not provide keyword search.
function HasKeywords(::PineconeIndex)
    return false
end

# Trait: a `PineconeIndex` provides embeddings.
function HasEmbeddings(::PineconeIndex)
    return true
end

# Accessor for the embeddings stored in the index (may be `nothing`).
function embeddings(index::PineconeIndex)
    return index.embeddings
end
150167

151168
abstract type AbstractDocumentTermMatrix end
152169

0 commit comments

Comments
 (0)