diff --git a/authors.yaml b/authors.yaml
index 0a576cd229..df32937aea 100644
--- a/authors.yaml
+++ b/authors.yaml
@@ -380,4 +380,9 @@ alexl-oai:
 glojain:
   name: "Glory Jain"
   website: "https://www.linkedin.com/in/gloryjain/"
-  avatar: "https://media.licdn.com/dms/image/v2/C4E03AQH72n6Sm5q69Q/profile-displayphoto-shrink_400_400/profile-displayphoto-shrink_400_400/0/1557995338725?e=1756339200&v=beta&t=FGTXiCZwTZvqHCY-wd8It15EDf11Rex1oLlBKRGHNtY"
\ No newline at end of file
+  avatar: "https://media.licdn.com/dms/image/v2/C4E03AQH72n6Sm5q69Q/profile-displayphoto-shrink_400_400/profile-displayphoto-shrink_400_400/0/1557995338725?e=1756339200&v=beta&t=FGTXiCZwTZvqHCY-wd8It15EDf11Rex1oLlBKRGHNtY"
+
+dhghomon:
+  name: "Dave MacLeod"
+  website: "https://www.linkedin.com/in/davemacleod/"
+  avatar: "https://media.licdn.com/dms/image/v2/C5603AQEb2B7UHzrsMA/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1646798838220?e=1756339200&v=beta&t=paU69ZEH97HPUqFDgNTE7jzA4eQldfhsWGP6N-r0x9g"
\ No newline at end of file
diff --git a/examples/vector_databases/README.md b/examples/vector_databases/README.md
index ebbb8fee0e..228b5ed8f1 100644
--- a/examples/vector_databases/README.md
+++ b/examples/vector_databases/README.md
@@ -25,6 +25,7 @@ Each provider has their own named directory, with a standard notebook to introdu
 - [Redis](https://github.com/RedisVentures/simple-vecsim-intro)
 - [SingleStoreDB](https://www.singlestore.com/blog/how-to-get-started-with-singlestore/)
 - [Supabase](https://supabase.com/docs/guides/ai)
+- [SurrealDB](https://surrealdb.com/solutions/ai)
 - [Tembo](https://tembo.io/docs/product/stacks/ai/vectordb)
 - [Typesense](https://typesense.org/docs/guide/)
 - [Vespa AI](https://vespa.ai/)
diff --git a/examples/vector_databases/surrealdb/README.md b/examples/vector_databases/surrealdb/README.md
new file mode 100644
index 0000000000..53b0943fbd
--- /dev/null
+++ b/examples/vector_databases/surrealdb/README.md
@@ -0,0 +1,19 @@
# SurrealDB

[SurrealDB](https://www.surrealdb.com/docs) is an open-source multi-model database built in Rust with natively supported AI-facing functionality such as [full-text search](https://surrealdb.com/docs/surrealdb/reference-guide/full-text-search), [vector search](https://surrealdb.com/docs/surrealdb/reference-guide/vector-search), and [graph queries](https://surrealdb.com/docs/surrealdb/models/graph).

## OpenAI Cookbook Examples

Below are guides and resources that walk you through how to use OpenAI embedding models with SurrealDB.

| Guide                                     | Description                                                                  |
| ----------------------------------------- | ---------------------------------------------------------------------------- |
| [Semantic search](./semantic-search.mdx)  | Store, index, and query embeddings using SurrealDB's built-in vector search. |
## Additional resources

- [Moving from full text to vector search](https://surrealdb.com/blog/moving-from-full-text-search-to-vector-search-in-surrealdb)
- [Vector search example](https://surrealdb.com/blog/find-your-celebrity-soulmate-with-the-magic-of-vector-search)
- [RAG using in-database embeddings](https://surrealdb.com/blog/cooking-up-faster-rag-using-in-database-embeddings-in-surrealdb)
- [Getting started with SurrealML](https://surrealdb.com/blog/what-is-surrealml-a-getting-started-guide)
- [Building real-time AI pipelines](https://surrealdb.com/blog/building-real-time-ai-pipelines-in-surrealdb)
\ No newline at end of file
diff --git a/examples/vector_databases/surrealdb/semantic-search.mdx b/examples/vector_databases/surrealdb/semantic-search.mdx
new file mode 100644
index 0000000000..c9ae50500a
--- /dev/null
+++ b/examples/vector_databases/surrealdb/semantic-search.mdx
@@ -0,0 +1,474 @@
# Semantic search using SurrealDB

This guide is a SurrealDB-ified version of [this great post](https://github.com/openai/openai-cookbook/blob/b35868e67bb662fbf294b425920f80aacf5a363c/examples/vector_databases/supabase/semantic-search.mdx) by [Greg Richardson](https://github.com/gregnr) for the [OpenAI cookbook](https://cookbook.openai.com/). Thanks for the great post!

## Intro

This guide demonstrates how to store OpenAI embeddings as [SurrealDB vectors](https://www.surrealdb.com/docs/surrealdb/reference-guide/vector-search) via the Rust SDK and use them for semantic search.

This guide uses Rust's [async-openai](https://crates.io/crates/async-openai/0.28.3) crate to generate embeddings, but you can modify it to use any [language supported by OpenAI](https://platform.openai.com/docs/libraries).

This guide covers:

- [Semantic search using SurrealDB](#semantic-search-using-surrealdb)
  - [Intro](#intro)
  - [Setup](#setup)
  - [Create a vector table](#create-a-vector-table)
  - [Generate OpenAI embeddings](#generate-openai-embeddings)
  - [Store embeddings in database](#store-embeddings-in-database)
  - [Semantic search](#semantic-search)

## Setup

Setting up an embedded SurrealDB database only takes a few lines of code. After creating a new Cargo project with `cargo new project_name` and going into the project folder, add the following dependencies inside `Cargo.toml`:

```toml
anyhow = "1.0.98"
async-openai = "0.28.3"
serde = "1.0.219"
surrealdb = { version = "2.3", features = ["kv-mem"] }
tokio = "1.45.0"
```

They can also be added on the command line using this command:

```
cargo add anyhow async-openai serde tokio surrealdb --features surrealdb/kv-mem
```

Inside `main()`, we can call the `connect` function with `"memory"` to instantiate an embedded database in memory. Since errors here can come from a number of sources (SurrealDB, async-openai, tokio), using `anyhow` is the easiest way to get started.

```rust
use anyhow::Error;
use surrealdb::engine::any::connect;

#[tokio::main]
async fn main() -> Result<(), Error> {
    let db = connect("memory").await?;
    Ok(())
}
```

If you have a Cloud or local instance to connect to, you can pass that address into the `connect` function instead.

```rust
// Cloud address
let db = connect("wss://cloud-docs-068rp16e0hsnl62vgooa7omjks.aws-euw1.staging.surrealdb.cloud").await?;

// Local address
let db = connect("ws://localhost:8000").await?;
```
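When connecting to a remote instance like this, you will usually need to sign in before doing anything else. Here is a minimal sketch, assuming an instance that was started with `--user root --pass root`:

```rust
use surrealdb::opt::auth::Root;

// Sign in as the root user (credentials here match the assumed
// `surreal start --user root --pass root` invocation).
db.signin(Root {
    username: "root",
    password: "root",
})
.await?;
```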
After connecting, we will select a namespace and database name, such as `ns` and `db`.

```rust
db.use_ns("ns").use_db("db").await?;
```

## Create a vector table

Next we'll create a table to store documents and embeddings, along with an index for the embeddings. The statements look like this:

```surql
DEFINE TABLE document;
DEFINE FIELD text ON document TYPE string;
DEFINE FIELD embedding ON document TYPE array;
DEFINE INDEX hnsw_embed ON document FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;
```

Inside the SDK we can put all four of these statements inside a single `.query()` call and then add a line to check whether any of them returned an error.

```rust
let mut res = db
    .query(
        "DEFINE TABLE document;
DEFINE FIELD text ON document TYPE string;
DEFINE FIELD embedding ON document TYPE array;
DEFINE INDEX hnsw_embed ON document FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;",
    )
    .await?;
for (index, error) in res.take_errors() {
    println!("Error in query {index}: {error}");
}
```

The important piece to understand is the relationship between the `embedding` field, a simple array of floats, and the `hnsw_embed` index. The dimension (1536 here) is the length of each embedding vector. Since OpenAI's `text-embedding-3-small` model in this example uses [1536 as its default length](https://platform.openai.com/docs/guides/embeddings), we set the dimension to 1536 to match.

The [HNSW index](https://www.surrealdb.com/docs/surrealdb/reference-guide/vector-search#vector-indexes) is not strictly necessary to use the KNN operator (`<||>`) to find an embedding's closest neighbours, and for our small sample code we will use the simple [brute force method](https://www.surrealdb.com/docs/surrealql/operators#brute-force-method), which takes [a distance function](https://www.surrealdb.com/docs/surrealdb/reference-guide/vector-search#computation-on-vectors-vector-package-of-functions) such as Euclidean, Hamming, and so on. The following is the query that we will use, which uses cosine distance to find the two closest neighbours.

```surql
SELECT
    text,
    vector::distance::knn() AS distance FROM document
    WHERE embedding <|2,COSINE|> $embeds
    ORDER BY distance;
```

As the dataset grows, if some loss of accuracy is acceptable then the syntax can be changed to use [the HNSW index](https://www.surrealdb.com/docs/surrealql/operators#hnsw-method), by replacing the distance function with a number that represents the size of the dynamic candidate list.

```surql
SELECT
    text,
    vector::distance::knn() AS distance FROM document
    WHERE embedding <|2,40|> $embeds
    ORDER BY distance;
```

Another option is to use the [MTREE](https://www.surrealdb.com/docs/surrealql/operators#mtree-index-method) index method.

## Generate OpenAI embeddings

At this point, you will need an [OpenAI API key](https://platform.openai.com/api-keys) to interact with the OpenAI API. If you don't have a key, you can still run the code to check that it compiles; you will simply get as far as this error message.

```
Error: invalid_request_error: Incorrect API key provided: blah. You can find your API key at https://platform.openai.com/account/api-keys. (code: invalid_api_key)
```

The best way to set the key is as an environment variable, `OPENAI_API_KEY` in this case. Using a `LazyLock` lets us read it via the `std::env::var()` function the first time it is accessed. You can of course put the key into a `const` for simplicity when first testing, but never hard-code API keys in production code.
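If you do take that testing-only shortcut while experimenting, it is nothing more than this (the value below is a made-up placeholder, not a real key):

```rust
// Testing only: a hard-coded placeholder key. Never commit a real key.
const KEY: &str = "sk-proj-...";
```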
The `LazyLock` version that we will use looks like this:

```rust
static KEY: LazyLock<String> = LazyLock::new(|| {
    std::env::var("OPENAI_API_KEY").unwrap()
});
```

And then run the code like this:

```bash
OPENAI_API_KEY=whateverthekeyis cargo run
```

Or like this if you are using PowerShell on Windows.

```powershell
$env:OPENAI_API_KEY = "whateverthekeyis"
cargo run
```

Inside `main()`, we will then [create a client](https://docs.rs/async-openai/0.28.3/async_openai/struct.Client.html) from the async-openai crate that holds this config.

```rust
let config = OpenAIConfig::new().with_api_key(&*KEY);
let client = Client::with_config(config);
```

We'll use that to generate an OpenAI embedding using [`text-embedding-3-small`](https://platform.openai.com/docs/guides/embeddings/embedding-models), as follows.

```rust
let input = "What does the cat chase?";

let request = CreateEmbeddingRequestArgs::default()
    .model("text-embedding-3-small")
    .input(input)
    .dimensions(1536u32)
    .build()?;
let mut result = client.embeddings().create(request).await?;
println!("{result:?}");
```

The output in your console should show a massive number of floats, 1536 of them to be precise. That's the embedding for this input!

## Store embeddings in database

Now that we have the embedding returned from the OpenAI client, we can store it in the database. The [response](https://docs.rs/async-openai/0.28.3/async_openai/types/struct.CreateEmbeddingResponse.html) returned from the async-openai crate looks like this, with a `Vec` of `Embedding` structs that each hold a `Vec<f32>`.

```rust
pub struct CreateEmbeddingResponse {
    pub object: String,
    pub model: String,
    pub data: Vec<Embedding>,
    pub usage: EmbeddingUsage,
}

pub struct Embedding {
    pub index: u32,
    pub object: String,
    pub embedding: Vec<f32>,
}
```

We know that our simple request only returned a single embedding, so `.remove(0)` will do the job (this is why `result` was declared `mut` above). In a more complex codebase you would probably opt for a match on `.get(0)` to handle any possible errors.

```rust
let embeds = result.data.remove(0).embedding;
```

There are a [number of ways](https://www.surrealdb.com/docs/sdk/rust/concepts/flexible-typing) to work with or avoid structs when using the Rust SDK, but we'll just go with two basic structs: one to represent the input into a `.create()` statement, which will implement `Serialize`, and another that implements `Deserialize` to show the result.

```rust
#[derive(Serialize)]
struct DocumentInput {
    text: String,
    embedding: Vec<f32>,
}

#[derive(Debug, Deserialize)]
struct Document {
    id: RecordId,
    embedding: Vec<f32>,
    text: String,
}
```

Once that is done, we can print out the created document as a `Document` struct.

```rust
let in_db = db
    .create::<Option<Document>>("document")
    .content(DocumentInput {
        text: input.into(),
        embedding: embeds.to_vec(),
    })
    .await?;
println!("{in_db:?}");
```
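Before adding more records, it can be reassuring to peek at what was actually stored. Here is a quick, purely illustrative sanity check (the `dims` alias is an assumption of ours, not part of the schema); it should report one document with 1536 dimensions:

```rust
// Peek at the stored record: its text plus the length of its embedding array.
let mut check = db
    .query("SELECT text, array::len(embedding) AS dims FROM document;")
    .await?;
let val: surrealdb::Value = check.take(0)?;
println!("{val}");
```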
We should now add some more `document` records. To do this, we'll move the logic that creates them into a function of its own:

```rust
async fn create_embed(
    input: &str,
    db: &Surreal<Any>,
    client: &Client<OpenAIConfig>,
) -> Result<(), Error> {
    let request = CreateEmbeddingRequestArgs::default()
        .model("text-embedding-3-small")
        .input(input)
        .dimensions(1536u32)
        .build()?;
    let result = client.embeddings().create(request).await?;

    let embeds = &result.data.get(0).unwrap().embedding;

    let _in_db = db
        .create::<Option<Document>>("document")
        .content(DocumentInput {
            text: input.into(),
            embedding: embeds.to_vec(),
        })
        .await?;
    Ok(())
}
```

And then call it a few times inside `main()`. See if you can guess the answers yourself!

```rust
for input in [
    "What does the cat chase?",
    "What do Fraggles love to eat?",
    "Which planet rotates slowly on its axis?",
    "Which Greek general helped Cyrus the Younger?",
    "What is the largest inland sea?",
] {
    create_embed(input, &db, &client).await?
}
```

## Semantic search

With our documents stored, it's time to perform semantic search over the embeddings in the database. We'll go with this query, which uses the KNN operator to return the closest two matches to an embedding.

```surql
SELECT
    text,
    vector::distance::knn() AS distance FROM document
    WHERE embedding <|2,COSINE|> $embeds
    ORDER BY distance;
```

You can customise this [with other distance functions](https://www.surrealdb.com/docs/surrealdb/reference-guide/vector-search#computation-on-vectors-vector-package-of-functions) such as Euclidean, Hamming, and so on.

We will then put this into a separate function called `test_embed()`, which looks similar to `create_embed()`, except that it uses the embedding retrieved from OpenAI to query the database against existing documents instead of creating a new document.

```rust
async fn test_embed(
    input: &str,
    db: &Surreal<Any>,
    client: &Client<OpenAIConfig>,
) -> Result<(), Error> {
    let request = CreateEmbeddingRequestArgs::default()
        .model("text-embedding-3-small")
        .input(input)
        .dimensions(1536u32)
        .build()?;
    let mut result = client.embeddings().create(request).await?;

    let embeds = result.data.remove(0).embedding;

    let mut response = db.query("SELECT text, vector::distance::knn() AS distance FROM document WHERE embedding <|2,COSINE|> $embeds ORDER BY distance;").bind(("embeds", embeds)).await?;
    let as_val: Value = response.take(0)?;
    println!("{as_val}\n");
    Ok(())
}
```

Finally, we will call this function a few times inside `main()`, printing out the results of each so that we can eyeball them and confirm that they are what we expect.

```rust
println!("Venus is closest to:");
test_embed("Venus", &db, &client).await?;

println!("Xenophon is closest to:");
test_embed("Xenophon", &db, &client).await?;

println!("Mice are closest to:");
test_embed("mouse", &db, &client).await?;

println!("Radishes are closest to:");
test_embed("radish", &db, &client).await?;

println!("The Caspian Sea is closest to:");
test_embed("Caspian Sea", &db, &client).await?;
```
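As an aside: since we defined the `hnsw_embed` index earlier, the query inside `test_embed()` could also be routed through that index instead of the brute-force operator. A sketch of the one statement that changes, with 40 as an assumed size for the dynamic candidate list:

```rust
// Same query as above, but using the HNSW index: the COSINE distance
// function is swapped for the size of the dynamic candidate list (40).
let mut response = db
    .query("SELECT text, vector::distance::knn() AS distance FROM document WHERE embedding <|2,40|> $embeds ORDER BY distance;")
    .bind(("embeds", embeds))
    .await?;
```

With only five documents the results should be identical, so we will stick with the brute-force operator for the output below.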
+ +``` +Venus is closest to: +[{ distance: 0.6495068000978139f, text: 'Which planet rotates slowly on its axis?' }, { distance: 0.8388033444017572f, text: 'Which Greek general helped Cyrus the Younger?' }] + +Xenophon is closest to: +[{ distance: 0.4421917772479055f, text: 'Which Greek general helped Cyrus the Younger?' }, { distance: 0.873354690471173f, text: 'What does the cat chase?' }] + +Mice are closest to: +[{ distance: 0.6945913095506092f, text: 'What does the cat chase?' }, { distance: 0.8249335430462937f, text: 'Which planet rotates slowly on its axis?' }] + +Radishes are closest to: +[{ distance: 0.7256996315669555f, text: 'What do Fraggles love to eat?' }, { distance: 0.8812784798259233f, text: 'What does the cat chase?' }] + +The Caspian Sea is closest to: +[{ distance: 0.49966454922547254f, text: 'What is the largest inland sea?' }, { distance: 0.8096568276647603f, text: 'Which Greek general helped Cyrus the Younger?' }] +``` + +Finally, here is all of the code for you to run and modify as you wish. Any questions or thoughts about this or semantic search using SurrealDB? Feel free to [drop by our community](https://www.surrealdb.com/community) to get in touch. + +```rust +use std::sync::LazyLock; + +use anyhow::Error; +use async_openai::{Client, config::OpenAIConfig, types::CreateEmbeddingRequestArgs}; +use serde::{Deserialize, Serialize}; +use surrealdb::{ + RecordId, Surreal, Value, + engine::any::{Any, connect}, +}; + +static KEY: LazyLock = LazyLock::new(|| std::env::var("OPENAI_API_KEY").unwrap()); + +#[derive(Serialize)] +struct DocumentInput { + text: String, + embedding: Vec, +} + +#[derive(Debug, Deserialize)] +struct Document { + id: RecordId, + embedding: Vec, + text: String, +} + +async fn create_embed( + input: &str, + db: &Surreal, + client: &Client, +) -> Result<(), Error> { + let request = CreateEmbeddingRequestArgs::default() + .model("text-embedding-3-small") + .input(input) + .dimensions(1536u32) + .build()?; + let mut result = client.embeddings().create(request).await?; + + let embeds = result.data.remove(0).embedding; + + let _in_db = db + .create::>("document") + .content(DocumentInput { + text: input.into(), + embedding: embeds.to_vec(), + }) + .await?; + Ok(()) +} + +async fn test_embed( + input: &str, + db: &Surreal, + client: &Client, +) -> Result<(), Error> { + let request = CreateEmbeddingRequestArgs::default() + .model("text-embedding-3-small") + .input(input) + .dimensions(1536u32) + .build()?; + let mut result = client.embeddings().create(request).await?; + + let embeds = result.data.remove(0).embedding; + + let mut response = db.query("SELECT text, vector::distance::knn() AS distance FROM document WHERE embedding <|2,COSINE|> $embeds ORDER BY distance;").bind(("embeds", embeds)).await?; + let as_val: Value = response.take(0)?; + println!("{as_val}\n"); + Ok(()) +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let db = connect("memory").await?; + + db.use_ns("ns").use_db("db").await?; + + let mut res = db + .query( + "DEFINE TABLE document; + DEFINE FIELD text ON document TYPE string; + DEFINE FIELD embedding ON document TYPE array; + DEFINE INDEX hnsw_embed ON document FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;", + ) + .await?; + for (index, error) in res.take_errors() { + println!("Error in query {index}: {error}"); + } + + let config = OpenAIConfig::new().with_api_key(&*KEY); + + let client = Client::with_config(config); + + for input in [ + "What does the cat chase?", + "What do Fraggles love to eat?", + "Which planet 
rotates slowly on its axis?", + "Which Greek general helped Cyrus the Younger?", + "What is the largest inland sea?", + ] { + create_embed(input, &db, &client).await? + } + + println!("Venus is closest to:"); + test_embed("Venus", &db, &client).await?; + + println!("Xenophon is closest to:"); + test_embed("Xenophon", &db, &client).await?; + + println!("Mice are closest to:"); + test_embed("mouse", &db, &client).await?; + + println!("Radishes are closest to:"); + test_embed("radish", &db, &client).await?; + + println!("The Caspian Sea is closest to:"); + test_embed("Caspian Sea", &db, &client).await?; + + Ok(()) +} +``` \ No newline at end of file diff --git a/registry.yaml b/registry.yaml index a8d021051a..cc1578d665 100644 --- a/registry.yaml +++ b/registry.yaml @@ -2253,3 +2253,19 @@ - deep-research - agents - agents-sdk + +- title: SurrealDB + path: examples/vector_databases/surrealdb/README.md + date: 2025-06-27 + authors: + - dghomon + tags: + - embeddings + +- title: Semantic search using SurrealDB + path: examples/vector_databases/surrealdb/semantic-search.mdx + date: 2025-06-27 + authors: + - dhghomon + tags: + - embeddings \ No newline at end of file