diff --git a/featuretools/primitives/standard/transform/natural_language/__init__.py b/featuretools/primitives/standard/transform/natural_language/__init__.py
index bfa389e9df..2d66c1c06a 100644
--- a/featuretools/primitives/standard/transform/natural_language/__init__.py
+++ b/featuretools/primitives/standard/transform/natural_language/__init__.py
@@ -1,3 +1,6 @@
+from featuretools.primitives.standard.transform.natural_language.openai import (
+    OpenAIEmbeddings,
+)
 from featuretools.primitives.standard.transform.natural_language.count_string import (
     CountString,
 )
diff --git a/featuretools/primitives/standard/transform/natural_language/openai/__init__.py b/featuretools/primitives/standard/transform/natural_language/openai/__init__.py
new file mode 100644
index 0000000000..fa0155f804
--- /dev/null
+++ b/featuretools/primitives/standard/transform/natural_language/openai/__init__.py
@@ -0,0 +1,3 @@
+from featuretools.primitives.standard.transform.natural_language.openai.embeddings import (
+    OpenAIEmbeddings,
+)
diff --git a/featuretools/primitives/standard/transform/natural_language/openai/embeddings.py b/featuretools/primitives/standard/transform/natural_language/openai/embeddings.py
new file mode 100644
index 0000000000..dc352c50a3
--- /dev/null
+++ b/featuretools/primitives/standard/transform/natural_language/openai/embeddings.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pandas as pd
+import tiktoken
+from openai.embeddings_utils import get_embedding
+from woodwork.column_schema import ColumnSchema
+from woodwork.logical_types import Double, NaturalLanguage
+
+from featuretools.primitives.base import TransformPrimitive
+from featuretools.primitives.standard.transform.natural_language.openai.model import (
+    OpenAIEmbeddingModel,
+)
+
+DEFAULT_MODEL = OpenAIEmbeddingModel(
+    name="text-embedding-ada-002",
+    encoding="cl100k_base",
+    max_tokens=8191,
+    output_dimensions=1536,
+)
+
+
+class OpenAIEmbeddings(TransformPrimitive):
+    """Generates embeddings using OpenAI.
+
+    Description:
+        Given list of strings, determine the embeddings for each string, using
+        the OpenAI model. Null strings, and strings with more tokens than the
+        model accepts, produce an embedding made up entirely of `NaN`.
+
+    Args:
+        model (OpenAIEmbeddingModel, optional): The model to use to produce embeddings.
+            Defaults to "text-embedding-ada-002" if not specified.
+
+    Examples:
+        >>> x = ['This is a test file', 'This is second line', 'third line $1,000', None]
+        >>> openai_embeddings = OpenAIEmbeddings()
+        >>> values = openai_embeddings(x)  # doctest: +SKIP
+        >>> len(values)  # doctest: +SKIP
+        1536
+    """
+
+    name = "openai_embeddings"
+    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
+    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
+
+    def __init__(self, model=DEFAULT_MODEL):
+        self.model = model
+        self.number_output_features = model.output_dimensions
+
+    def get_function(self):
+        encoding = tiktoken.get_encoding(self.model.encoding)
+
+        def is_too_many_tokens(element):
+            return len(encoding.encode(element)) > self.model.max_tokens
+
+        def get_openai_embeddings(series):
+            invalid = [np.nan] * self.number_output_features
+            result = []
+            for element in series:
+                if pd.isnull(element) or is_too_many_tokens(element):
+                    result.append(invalid)
+                else:
+                    embedding = get_embedding(element, engine=self.model.name)
+                    result.append(embedding)
+            if not result:
+                # Empty input must still yield one (empty) column per output
+                # feature; transposing an empty array would yield none.
+                return pd.Series([[] for _ in range(self.number_output_features)])
+            # Transpose rows-per-string into columns-per-embedding-dimension.
+            result = np.array(result).T.tolist()
+            return pd.Series(result)
+
+        return get_openai_embeddings
diff --git a/featuretools/primitives/standard/transform/natural_language/openai/model.py b/featuretools/primitives/standard/transform/natural_language/openai/model.py
new file mode 100644
index 0000000000..5d20b8732a
--- /dev/null
+++ b/featuretools/primitives/standard/transform/natural_language/openai/model.py
@@ -0,0 +1,28 @@
+class OpenAIModel(object):
+    """A model accessible via the OpenAI API.
+
+    Args:
+        name (str): The name of the model, as passed to the OpenAI API.
+        encoding (str): The name of the tiktoken encoding the model uses.
+        max_tokens (int): The maximum number of tokens the model accepts.
+    """
+
+    def __init__(self, name, encoding, max_tokens):
+        self.name = name
+        self.encoding = encoding
+        self.max_tokens = max_tokens
+
+
+class OpenAIEmbeddingModel(OpenAIModel):
+    """A model accessible via the OpenAI API that can produce embeddings.
+
+    Args:
+        name (str): The name of the model, as passed to the OpenAI API.
+        encoding (str): The name of the tiktoken encoding the model uses.
+        max_tokens (int): The maximum number of tokens the model accepts.
+        output_dimensions (int): The length of each embedding the model produces.
+    """
+
+    def __init__(self, name, encoding, max_tokens, output_dimensions):
+        self.output_dimensions = output_dimensions
+        super().__init__(name, encoding, max_tokens)
diff --git a/pyproject.toml b/pyproject.toml
index b5634871fa..5a2b941bef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,8 @@ dependencies = [
     "scipy >= 1.4.0",
     "tqdm >= 4.32.0",
     "woodwork[dask] >= 0.18.0",
+    "openai[embeddings] >= 0.26.5",
+    "tiktoken >= 0.3.0",
 ]
 
 [project.urls]