Skip to content

Add OpenAI Embeddings Primitive #2502

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from featuretools.primitives.standard.transform.natural_language.openai import (
OpenAIEmbeddings,
)
from featuretools.primitives.standard.transform.natural_language.count_string import (
CountString,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from featuretools.primitives.standard.transform.natural_language.openai.embeddings import (
OpenAIEmbeddings,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, NaturalLanguage

from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.standard.transform.natural_language.openai.model import (
OpenAIEmbeddingModel,
)

# Model configuration that OpenAIEmbeddings falls back to when it is
# constructed without an explicit ``model`` argument.
DEFAULT_MODEL = OpenAIEmbeddingModel(
name="text-embedding-ada-002",
encoding="cl100k_base",  # tokenizer name handed to tiktoken.get_encoding
max_tokens=8191,  # inputs whose token count exceeds this are not embedded
output_dimensions=1536,  # one output feature is produced per dimension
)


class OpenAIEmbeddings(TransformPrimitive):
    """Generates embeddings using OpenAI.

    Description:
        Given a list of strings, determine the embedding of each string using
        an OpenAI embedding model. Every embedding dimension becomes one
        output feature, so the primitive produces ``model.output_dimensions``
        features. Missing values, and strings whose token count exceeds the
        model's ``max_tokens``, yield ``nan`` in every dimension.

    Args:
        model (OpenAIEmbeddingModel, optional): The model to use to produce embeddings.
            Defaults to "text-embedding-ada-002" if not specified.

    Examples:
        The calls below are skipped under doctest because computing an
        embedding goes through ``openai.embeddings_utils.get_embedding``.

        >>> x = ['This is a test file', 'This is second line', None]
        >>> openai_embeddings = OpenAIEmbeddings()
        >>> values = openai_embeddings(x)  # doctest: +SKIP
        >>> len(values)  # doctest: +SKIP
        1536
        >>> len(values[0])  # doctest: +SKIP
        3
    """

    name = "openai_embeddings"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})

    def __init__(self, model=DEFAULT_MODEL):
        self.model = model
        # One output feature per embedding dimension.
        self.number_output_features = model.output_dimensions

    def get_function(self):
        """Build the function that embeds a series of strings.

        Returns:
            callable: Accepts a series of text values and returns a
            ``pd.Series`` with one entry per embedding dimension; each entry
            is a list holding that dimension's value for every input row.
        """
        encoding = tiktoken.get_encoding(self.model.encoding)
        # Loop invariants, resolved once instead of per element.
        model_name = self.model.name
        max_tokens = self.model.max_tokens
        n_outputs = self.number_output_features

        def is_too_many_tokens(element):
            # disallowed_special=() makes tiktoken tokenize special-token
            # text (e.g. "<|endoftext|>") as ordinary text instead of raising
            # ValueError, so arbitrary user text can be measured.
            tokens = encoding.encode(element, disallowed_special=())
            return len(tokens) > max_tokens

        def get_openai_embeddings(series):
            invalid = [np.nan] * n_outputs
            rows = [
                invalid
                if pd.isnull(element) or is_too_many_tokens(element)
                else get_embedding(element, engine=model_name)
                for element in series
            ]
            if not rows:
                # Transposing an empty array would produce zero columns; an
                # empty input must still yield one (empty) column per output
                # feature.
                columns = [[] for _ in range(n_outputs)]
            else:
                # rows is (n_rows, n_outputs); transpose to one list per
                # output feature.
                columns = np.array(rows).T.tolist()
            return pd.Series(columns)

        return get_openai_embeddings
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class OpenAIModel(object):
    """Describes a model that is reachable through the OpenAI API.

    Args:
        name: Identifier of the model.
        encoding: Name of the token encoding associated with the model.
        max_tokens: Token limit associated with the model.
    """

    def __init__(self, name, encoding, max_tokens):
        # Plain value holder: store the three descriptors unchanged.
        self.name, self.encoding, self.max_tokens = name, encoding, max_tokens


class OpenAIEmbeddingModel(OpenAIModel):
    """Describes an OpenAI API model that is able to produce embeddings.

    Args:
        name: Identifier of the model.
        encoding: Name of the token encoding associated with the model.
        max_tokens: Token limit associated with the model.
        output_dimensions: Length of the embedding vectors the model emits.
    """

    def __init__(self, name, encoding, max_tokens, output_dimensions):
        self.output_dimensions = output_dimensions
        # The remaining descriptors are stored by the base class.
        super().__init__(name=name, encoding=encoding, max_tokens=max_tokens)
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ dependencies = [
"scipy >= 1.4.0",
"tqdm >= 4.32.0",
"woodwork[dask] >= 0.18.0",
"openai[embeddings] >= 0.26.5",
"tiktoken >= 0.3.0",
]

[project.urls]
Expand Down