diff --git a/bin/openai b/bin/openai
index 5fb88a4aca..8bfbe5ac1e 100755
--- a/bin/openai
+++ b/bin/openai
@@ -1,13 +1,11 @@
 #!/usr/bin/env python
 import argparse
-import json
 import logging
-import os
 import sys
 
 import openai
 from openai.cli import display_error
-from openai.cli import register as api_register
+from openai.cli import api_register, tools_register
 
 logger = logging.getLogger()
 formatter = logging.Formatter("[%(asctime)s] %(message)s")
@@ -40,9 +38,11 @@ def main():
     parser.set_defaults(func=help)
 
     subparsers = parser.add_subparsers()
-    sub = subparsers.add_parser("api", help="Direct API calls")
+    sub_api = subparsers.add_parser("api", help="Direct API calls")
+    sub_tools = subparsers.add_parser("tools", help="Client side tools for convenience")
 
-    api_register(sub)
+    api_register(sub_api)
+    tools_register(sub_tools)
 
     args = parser.parse_args()
     if args.verbosity == 1:
diff --git a/openai/api_resources/completion.py b/openai/api_resources/completion.py
index ad56d270a5..f39ef8f9f4 100644
--- a/openai/api_resources/completion.py
+++ b/openai/api_resources/completion.py
@@ -19,7 +19,7 @@ def create(cls, *args, **kwargs):
         of valid parameters.
         """
         start = time.time()
-        timeout = kwargs.get("timeout", None)
+        timeout = kwargs.pop("timeout", None)
         if kwargs.get("model", None) is None and kwargs.get("engine", None) is None:
             raise InvalidRequestError(
                 "Must provide an 'engine' or 'model' parameter to create a Completion.",
diff --git a/openai/api_resources/file.py b/openai/api_resources/file.py
index 7b9a03a56c..4a5feb92d9 100644
--- a/openai/api_resources/file.py
+++ b/openai/api_resources/file.py
@@ -1,5 +1,8 @@
 from __future__ import absolute_import, division, print_function
 
+import json
+import os
+
 import openai
 from openai import api_requestor, util
 from openai.api_resources.abstract import (
@@ -29,3 +32,59 @@ def create(
         return util.convert_to_openai_object(
             response, api_key, api_version, organization
         )
+
+    @classmethod
+    def download(
+        cls, id, api_key=None, api_base=None, api_version=None, organization=None
+    ):
+        requestor = api_requestor.APIRequestor(
+            api_key,
+            api_base=api_base or openai.file_api_base or openai.api_base,
+            api_version=api_version,
+            organization=organization,
+        )
+        url = f"{cls.class_url()}/{id}/content"
+        rbody, rcode, rheaders, _, _ = requestor.request_raw("get", url)
+        if not 200 <= rcode < 300:
+            raise requestor.handle_error_response(
+                rbody, rcode, json.loads(rbody), rheaders, stream_error=False
+            )
+        return rbody
+
+    @classmethod
+    def find_matching_files(
+        cls,
+        api_key=None,
+        api_base=None,
+        api_version=None,
+        organization=None,
+        file=None,
+        purpose=None,
+    ):
+        if file is None:
+            raise openai.error.InvalidRequestError(
+                "'file' is a required property", "file"
+            )
+        if purpose is None:
+            raise openai.error.InvalidRequestError(
+                "'purpose' is a required property", "purpose"
+            )
+        all_files = cls.list(
+            api_key=api_key,
+            api_base=api_base or openai.file_api_base or openai.api_base,
+            api_version=api_version,
+            organization=organization,
+        ).get("data", [])
+        matching_files = []
+        for f in all_files:
+            if f["purpose"] != purpose:
+                continue
+            if not hasattr(file, "name") or f["filename"] != file.name:
+                continue
+            file.seek(0, os.SEEK_END)
+            if f["bytes"] != file.tell():
+                file.seek(0)
+                continue
+            file.seek(0)
+            matching_files.append(f)
+        return matching_files
diff --git a/openai/cli.py b/openai/cli.py
index 2802f32796..2c70870361 100644
--- a/openai/cli.py
+++ b/openai/cli.py
@@ -1,9 +1,15 @@
 import datetime
-import json
 import os
 import signal
 import sys
 import warnings
 
+from openai.validators import (
+    write_out_file,
+    apply_necessary_remediation,
+    apply_optional_remediation,
+    read_any_format,
+    get_validators,
+)
 import openai
 
@@ -221,12 +227,48 @@ def list(cls, args):
         print(resp)
 
     @classmethod
-    def _get_or_upload(cls, file):
+    def _get_or_upload(cls, file, check_if_file_exists=True):
         try:
             openai.File.retrieve(file)
         except openai.error.InvalidRequestError as e:
             if e.http_status == 404 and os.path.isfile(file):
-                resp = openai.File.create(file=open(file), purpose="fine-tune")
+                matching_files = openai.File.find_matching_files(
+                    file=open(file), purpose="fine-tune"
+                )
+                if len(matching_files) > 0 and check_if_file_exists:
+                    file_ids = [f["id"] for f in matching_files]
+                    sys.stdout.write(
+                        "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
+                            name=matching_files[0]["filename"],
+                            size=matching_files[0]["bytes"],
+                        )
+                    )
+                    sys.stdout.write("\n".join(file_ids))
+                    while True:
+                        sys.stdout.write(
+                            "\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
+                        )
+                        inp = sys.stdin.readline().strip()
+                        if inp in file_ids:
+                            sys.stdout.write(
+                                "Using your file {file}: {id}\n".format(
+                                    file=file, id=inp
+                                )
+                            )
+                            return inp
+                        elif inp == "":
+                            break
+                        else:
+                            sys.stdout.write(
+                                "File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
+                                    id=inp
+                                )
+                            )
+
+                resp = openai.File.create(
+                    file=open(file),
+                    purpose="fine-tune",
+                )
                 sys.stdout.write(
                     "Uploaded file from {file}: {id}\n".format(file=file, id=resp["id"])
                 )
@@ -236,21 +278,38 @@ def _get_or_upload(cls, file):
     @classmethod
     def create(cls, args):
         create_args = {
-            "training_file": cls._get_or_upload(args.training_file),
+            "training_file": cls._get_or_upload(
+                args.training_file, args.check_if_files_exist
+            ),
         }
         if args.validation_file:
-            create_args["validation_file"] = cls._get_or_upload(args.validation_file)
+            create_args["validation_file"] = cls._get_or_upload(
+                args.validation_file, args.check_if_files_exist
+            )
         if args.model:
             create_args["model"] = args.model
-        if args.hparams:
-            try:
-                hparams = json.loads(args.hparams)
-            except json.decoder.JSONDecodeError:
-                sys.stderr.write(
-                    "--hparams must be JSON decodable and match the hyperparameter arguments of the API"
-                )
-                sys.exit(1)
-            create_args.update(hparams)
+        if args.n_epochs:
+            create_args["n_epochs"] = args.n_epochs
+        if args.batch_size:
+            create_args["batch_size"] = args.batch_size
+        if args.learning_rate_multiplier:
+            create_args["learning_rate_multiplier"] = args.learning_rate_multiplier
+        create_args["use_packing"] = args.use_packing
+        if args.prompt_loss_weight:
+            create_args["prompt_loss_weight"] = args.prompt_loss_weight
+        if args.compute_classification_metrics:
+            create_args[
+                "compute_classification_metrics"
+            ] = args.compute_classification_metrics
+        if args.classification_n_classes:
+            create_args["classification_n_classes"] = args.classification_n_classes
+        if args.classification_positive_class:
+            create_args[
+                "classification_positive_class"
+            ] = args.classification_positive_class
+        if args.classification_betas:
+            betas = [float(x) for x in args.classification_betas.split(",")]
+            create_args["classification_betas"] = betas
 
         resp = openai.FineTune.create(**create_args)
 
@@ -271,6 +330,17 @@ def create(cls, args):
     def get(cls, args):
         resp = openai.FineTune.retrieve(id=args.id)
         print(resp)
+
+    @classmethod
+    def results(cls, args):
+        fine_tune = openai.FineTune.retrieve(id=args.id)
+        if "result_files" not in fine_tune or len(fine_tune["result_files"]) == 0:
+            raise openai.error.InvalidRequestError(
+                f"No results file available for fine-tune {args.id}", "id"
+            )
+        result_file = fine_tune["result_files"][0]
+        resp = openai.File.download(id=result_file["id"])
+        print(resp.decode("utf-8"))
 
     @classmethod
     def events(cls, args):
@@ -329,8 +399,69 @@ def cancel(cls, args):
         resp = openai.FineTune.cancel(id=args.id)
         print(resp)
 
+    @classmethod
+    def prepare_data(cls, args):
+
+        sys.stdout.write("Analyzing...\n")
+        fname = args.file
+        df, remediation = read_any_format(fname)
+        apply_necessary_remediation(None, remediation)
+
+        validators = get_validators()
+
+        optional_remediations = []
+        if remediation is not None:
+            optional_remediations.append(remediation)
+        for validator in validators:
+            remediation = validator(df)
+            if remediation is not None:
+                optional_remediations.append(remediation)
+                df = apply_necessary_remediation(df, remediation)
+
+        any_optional_or_necessary_remediations = any(
+            [
+                remediation
+                for remediation in optional_remediations
+                if remediation.optional_msg is not None
+                or remediation.necessary_msg is not None
+            ]
+        )
+
+        if any_optional_or_necessary_remediations:
+            sys.stdout.write(
+                "\n\nBased on the analysis we will perform the following actions:\n"
+            )
+
+            for remediation in optional_remediations:
+                df = apply_optional_remediation(df, remediation)
+        else:
+            sys.stdout.write("\n\nNo remediations found.\n")
+
+        write_out_file(df, fname, any_optional_or_necessary_remediations)
+
 
-def register(parser):
+def tools_register(parser):
+    subparsers = parser.add_subparsers(
+        title="Tools", help="Convenience client side tools"
+    )
+
+    def help(args):
+        parser.print_help()
+
+    parser.set_defaults(func=help)
+
+    sub = subparsers.add_parser("fine_tunes.prepare_data")
+    sub.add_argument(
+        "-f",
+        "--file",
+        required=True,
+        help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed. "
+        "This should be the local file path.",
+    )
+    sub.set_defaults(func=FineTune.prepare_data)
+
+
+def api_register(parser):
     # Engine management
     subparsers = parser.add_subparsers(help="All API subcommands")
 
@@ -544,6 +675,12 @@ def help(args):
         "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
         "or a local file path.",
     )
+    sub.add_argument(
+        "--no_check_if_files_exist",
+        dest="check_if_files_exist",
+        action="store_false",
+        help="If this argument is set and training_file or validation_file are file paths, immediately upload them. If this argument is not set, check if they may be duplicates of already uploaded files before uploading, based on file name and file size.",
+    )
     sub.add_argument(
         "-m",
         "--model",
@@ -554,13 +691,84 @@ def help(args):
         action="store_true",
         help="If set, returns immediately after creating the job. Otherwise, waits for the job to complete.",
     )
-    sub.add_argument("-p", "--hparams", help="Hyperparameter JSON")
+    sub.add_argument(
+        "--n_epochs",
+        type=int,
+        help="The number of epochs to train the model for. An epoch refers to one "
+        "full cycle through the training dataset.",
+    )
+    sub.add_argument(
+        "--batch_size",
+        type=int,
+        help="The batch size to use for training. The batch size is the number of "
+        "training examples used to train a single forward and backward pass.",
+    )
+    sub.add_argument(
+        "--learning_rate_multiplier",
+        type=float,
The fine-tuning " + "learning rate is determined by the original learning rate used for " + "pretraining multiplied by this value", + ) + sub.add_argument( + "--use_packing", + action="store_true", + dest="use_packing", + help="On classification tasks, we recommend not setting this flag. " + "On all other tasks, we recommend setting it. " + "When set, we pack as many prompt-completion pairs as possible into each " + "training example. This greatly increases the speed of a fine-tuning job, " + "often without negatively affecting model performance.", + ) + sub.add_argument( + "--no_packing", + action="store_false", + dest="use_packing", + help="Disables the packing flag (see --use_packing for description)", + ) + sub.set_defaults(use_packing=True) + sub.add_argument( + "--prompt_loss_weight", + type=float, + help="The weight to use for the prompt loss. The optimum value here depends " + "depends on your use case. This determines how much the model prioritizes " + "learning from prompt tokens vs learning from completion tokens", + ) + sub.add_argument( + "--compute_classification_metrics", + action="store_true", + help="If set, we calculate classification-specific metrics such as accuracy " + "and F-1 score using the validation set at the end of every epoch.", + ) + sub.add_argument( + "--classification_n_classes", + type=int, + help="The number of classes in a classification task. This parameter is " + "required for multiclass classification", + ) + sub.add_argument( + "--classification_positive_class", + help="The positive class in binary classification. This parameter is needed " + "to generate precision, recall and F-1 metrics when doing binary " + "classification", + ) + sub.add_argument( + "--classification_betas", + help="If this is provided, we calculate F-beta scores at the specified beta " + "values. The F-beta score is a generalization of F-1 score. This is only " + "used for binary classification. The expected format is a comma-separated " + "list - e.g. 1,1.5,2", + ) sub.set_defaults(func=FineTune.create) sub = subparsers.add_parser("fine_tunes.get") sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job") sub.set_defaults(func=FineTune.get) + sub = subparsers.add_parser("fine_tunes.results") + sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job") + sub.set_defaults(func=FineTune.results) + sub = subparsers.add_parser("fine_tunes.events") sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job") sub.add_argument( diff --git a/openai/validators.py b/openai/validators.py new file mode 100644 index 0000000000..ce34e2d76a --- /dev/null +++ b/openai/validators.py @@ -0,0 +1,679 @@ +import os +import sys +import pandas as pd + +from typing import NamedTuple, Optional, Callable, Any + + +class Remediation(NamedTuple): + name: str + immediate_msg: Optional[str] = None + necessary_msg: Optional[str] = None + necessary_fn: Optional[Callable[[Any], Any]] = None + optional_msg: Optional[str] = None + optional_fn: Optional[Callable[[Any], Any]] = None + error_msg: Optional[str] = None + + +def num_examples_validator(df): + """ + This validator will only print out the number of examples and recommend to the user to increase the number of examples if less than 100. + """ + MIN_EXAMPLES = 100 + optional_suggestion = ( + "" + if len(df) >= MIN_EXAMPLES + else ". In general, we recommend having at least a few hundred examples. 
+        else ". In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples"
+    )
+    immediate_msg = (
+        f"\n- Your file contains {len(df)} prompt-completion pairs{optional_suggestion}"
+    )
+    return Remediation(name="num_examples", immediate_msg=immediate_msg)
+
+
+def necessary_column_validator(df, necessary_column):
+    """
+    This validator will ensure that the necessary column is present in the dataframe.
+    """
+
+    def lower_case_column(df, column):
+        cols = [c for c in df.columns if c.lower() == column]
+        df.rename(columns={cols[0]: column.lower()}, inplace=True)
+        return df
+
+    immediate_msg = None
+    necessary_fn = None
+    necessary_msg = None
+    error_msg = None
+
+    if necessary_column not in df.columns:
+        if necessary_column in [c.lower() for c in df.columns]:
+
+            def lower_case_column_creator(df):
+                return lower_case_column(df, necessary_column)
+
+            necessary_fn = lower_case_column_creator
+            immediate_msg = (
+                f"\n- The `{necessary_column}` column/key should be lowercase"
+            )
+            necessary_msg = f"Lower case column name to `{necessary_column}`"
+        else:
+            error_msg = f"`{necessary_column}` column/key is missing. Please make sure you name your columns/keys appropriately, then retry"
+
+    return Remediation(
+        name="necessary_column",
+        immediate_msg=immediate_msg,
+        necessary_msg=necessary_msg,
+        necessary_fn=necessary_fn,
+        error_msg=error_msg,
+    )
+
+
+def additional_column_validator(df):
+    """
+    This validator will remove additional columns from the dataframe.
+    """
+    additional_columns = []
+    necessary_msg = None
+    immediate_msg = None
+    necessary_fn = None
+    if len(df.columns) > 2:
+        additional_columns = [
+            c for c in df.columns if c not in ["prompt", "completion"]
+        ]
+        warn_message = ""
+        for ac in additional_columns:
+            dups = [c for c in additional_columns if ac in c]
+            if len(dups) > 0:
+                warn_message += f"\n  WARNING: Some of the additional columns/keys contain `{ac}` in their name. These will be ignored, and the column/key `{ac}` will be used instead. This could also result from a duplicate column/key in the provided file."
+        immediate_msg = f"\n- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: {additional_columns}{warn_message}"
+        necessary_msg = f"Remove additional columns/keys: {additional_columns}"
+
+        def necessary_fn(x):
+            return x[["prompt", "completion"]]
+
+    return Remediation(
+        name="additional_column",
+        immediate_msg=immediate_msg,
+        necessary_msg=necessary_msg,
+        necessary_fn=necessary_fn,
+    )
+
+
+def non_empty_completion_validator(df):
+    """
+    This validator will ensure that no completion is empty.
+    """
+    necessary_msg = None
+    necessary_fn = None
+    immediate_msg = None
+
+    if (
+        df["completion"].apply(lambda x: x == "").any()
+        or df["completion"].isnull().any()
+    ):
+        empty_rows = (df["completion"] == "") | (df["completion"].isnull())
+        empty_indexes = df.reset_index().index[empty_rows].tolist()
+        immediate_msg = f"\n- `completion` column/key should not contain empty strings. These are rows: {empty_indexes}"
+
+        def necessary_fn(x):
+            return x[x["completion"] != ""].dropna(subset=["completion"])
+
+        necessary_msg = f"Remove {len(empty_indexes)} rows with empty completions"
+    return Remediation(
+        name="empty_completion",
+        immediate_msg=immediate_msg,
+        necessary_msg=necessary_msg,
+        necessary_fn=necessary_fn,
+    )
+
+
+def duplicated_rows_validator(df):
+    """
+    This validator will suggest that the user remove duplicate rows if they exist.
+ """ + duplicated_rows = df.duplicated(subset=["prompt", "completion"]) + duplicated_indexes = df.reset_index().index[duplicated_rows].tolist() + immediate_msg = None + optional_msg = None + optional_fn = None + + if len(duplicated_indexes) > 0: + immediate_msg = f"\n- There are {len(duplicated_indexes)} duplicated prompt-completion pairs. These are rows: {duplicated_indexes}" + optional_msg = f"Remove {len(duplicated_indexes)} duplicate rows" + + def optional_fn(x): + return x.drop_duplicates(subset=["prompt", "completion"]) + + return Remediation( + name="duplicated_rows", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + ) + + +def common_prompt_suffix_validator(df): + """ + This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation. + """ + error_msg = None + immediate_msg = None + optional_msg = None + optional_fn = None + + # Find a suffix which is not contained within the prompt otherwise + suggested_suffix = "\n\n### =>\n\n" + suffix_options = [ + " ->", + "\n\n###\n\n", + "\n\n===\n\n", + "\n\n---\n\n", + "\n\n===>\n\n", + "\n\n--->\n\n", + ] + for suffix_option in suffix_options: + if suffix_option == " ->": + if df.prompt.str.contains("\n").any(): + continue + if df.prompt.str.contains(suffix_option).any(): + continue + suggested_suffix = suffix_option + break + display_suggested_suffix = suggested_suffix.replace("\n", "\\n") + + ft_type = infer_task_type(df) + if ft_type == "open-ended generation": + return Remediation(name="common_suffix") + + def add_suffix(x, suffix): + x["prompt"] += suffix + return x + + common_suffix = get_common_xfix(df.prompt, xfix="suffix") + if (df.prompt == common_suffix).all(): + error_msg = f"All prompts are identical: `{common_suffix}`\nConsider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are different" + return Remediation(name="common_suffix", error_msg=error_msg) + + if common_suffix != "": + common_suffix_new_line_handled = common_suffix.replace("\n", "\\n") + immediate_msg = ( + f"\n- All prompts end with suffix `{common_suffix_new_line_handled}`" + ) + if len(common_suffix) > 10: + immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`" + if df.prompt.str[: -len(common_suffix)].str.contains(common_suffix).any(): + immediate_msg += f"\n WARNING: Some of your prompts contain the suffix `{common_suffix}` more than once. We strongly suggest that you review your prompts and add a unique suffix" + + else: + immediate_msg = "\n- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See `Fine Tuning How to Guide` for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty" + + if common_suffix == "": + optional_msg = ( + f"Add a suffix separator `{display_suggested_suffix}` to all prompts" + ) + + def optional_fn(x): + return add_suffix(x, suggested_suffix) + + return Remediation( + name="common_completion_suffix", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + error_msg=error_msg, + ) + + +def common_prompt_prefix_validator(df): + """ + This validator will suggest to remove a common prefix from the prompt if a long one exist. 
+ """ + MAX_PREFIX_LEN = 12 + + immediate_msg = None + optional_msg = None + optional_fn = None + + common_prefix = get_common_xfix(df.prompt, xfix="prefix") + if common_prefix == "": + return Remediation(name="common_prefix") + + def remove_common_prefix(x, prefix): + x["prompt"] = x["prompt"].str[len(prefix) :] + return x + + if (df.prompt == common_prefix).all(): + # already handled by common_suffix_validator + return Remediation(name="common_prefix") + + if common_prefix != "": + immediate_msg = f"\n- All prompts start with prefix `{common_prefix}`" + if MAX_PREFIX_LEN < len(common_prefix): + immediate_msg += ". Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completion" + optional_msg = f"Remove prefix `{common_prefix}` from all prompts" + + def optional_fn(x): + return remove_common_prefix(x, common_prefix) + + return Remediation( + name="common_prompt_prefix", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + ) + + +def common_completion_prefix_validator(df): + """ + This validator will suggest to remove a common prefix from the completion if a long one exist. + """ + MAX_PREFIX_LEN = 5 + + common_prefix = get_common_xfix(df.completion, xfix="prefix") + if len(common_prefix) < MAX_PREFIX_LEN: + return Remediation(name="common_prefix") + + def remove_common_prefix(x, prefix): + x["completion"] = x["completion"].str[len(prefix) :] + return x + + if (df.completion == common_prefix).all(): + # already handled by common_suffix_validator + return Remediation(name="common_prefix") + + immediate_msg = f"\n- All completions start with prefix `{common_prefix}`. Most of the time you should only add the output data into the completion, without any prefix" + optional_msg = f"Remove prefix `{common_prefix}` from all completions" + + def optional_fn(x): + return remove_common_prefix(x, common_prefix) + + return Remediation( + name="common_completion_prefix", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + ) + + +def common_completion_suffix_validator(df): + """ + This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation. 
+ """ + error_msg = None + immediate_msg = None + optional_msg = None + optional_fn = None + + # Find a suffix which is not contained within the completion otherwise + suggested_suffix = " [END]" + suffix_options = [ + "\n", + ".", + " END", + "***", + "+++", + "&&&", + "$$$", + "@@@", + "%%%", + ] + for suffix_option in suffix_options: + if df.completion.str.contains(suffix_option).any(): + continue + suggested_suffix = suffix_option + break + display_suggested_suffix = suggested_suffix.replace("\n", "\\n") + + ft_type = infer_task_type(df) + if ft_type == "open-ended generation": + return Remediation(name="common_suffix") + + def add_suffix(x, suffix): + x["completion"] += suffix + return x + + common_suffix = get_common_xfix(df.completion, xfix="suffix") + if (df.completion == common_suffix).all(): + error_msg = f"All completions are identical: `{common_suffix}`\nEnsure completions are different, otherwise the model will just repeat `{common_suffix}`" + return Remediation(name="common_suffix", error_msg=error_msg) + + if common_suffix != "": + common_suffix_new_line_handled = common_suffix.replace("\n", "\\n") + immediate_msg = ( + f"\n- All completions end with suffix `{common_suffix_new_line_handled}`" + ) + if len(common_suffix) > 10: + immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`" + if df.completion.str[: -len(common_suffix)].str.contains(common_suffix).any(): + immediate_msg += f"\n WARNING: Some of your completions contain the suffix `{common_suffix}` more than once. We suggest that you review your completions and add a unique ending" + + else: + immediate_msg = "\n- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See `Fine Tuning How to Guide` for more detail and examples." + + if common_suffix == "": + optional_msg = ( + f"Add a suffix ending `{display_suggested_suffix}` to all completions" + ) + + def optional_fn(x): + return add_suffix(x, suggested_suffix) + + return Remediation( + name="common_completion_suffix", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + error_msg=error_msg, + ) + + +def completions_space_start_validator(df): + """ + This validator will suggest to add a space at the start of the completion if it doesn't already exist. This helps with tokenization. + """ + + def add_space_start(x): + x["completion"] = x["completion"].apply( + lambda x: ("" if x[0] == " " else " ") + x + ) + return x + + optional_msg = None + optional_fn = None + immediate_msg = None + + if df.completion.str[:1].nunique() != 1 or df.completion.values[0][0] != " ": + immediate_msg = "\n- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See `Fine Tuning How to Guide` for more details" + optional_msg = "Add a whitespace character to the beginning of the completion" + optional_fn = add_space_start + return Remediation( + name="completion_space_start", + immediate_msg=immediate_msg, + optional_msg=optional_msg, + optional_fn=optional_fn, + ) + + +def lower_case_validator(df, column): + """ + This validator will suggest to lowercase the column values, if more than a third of letters are uppercase. 
+ """ + + def lower_case(x): + x[column] = x[column].str.lower() + return x + + count_upper = ( + df[column] + .apply(lambda x: sum(1 for c in x if c.isalpha() and c.isupper())) + .sum() + ) + count_lower = ( + df[column] + .apply(lambda x: sum(1 for c in x if c.isalpha() and c.islower())) + .sum() + ) + + if count_upper * 2 > count_lower: + return Remediation( + name="lower_case", + immediate_msg=f"\n- More than a third of your `{column}` column/key is uppercase. Uppercase {column}s tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See `Fine Tuning How to Guide` for more details", + optional_msg=f"Lowercase all your data in column/key `{column}`", + optional_fn=lower_case, + ) + + +def read_any_format(fname): + """ + This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas. + - for .xlsx it will read the first sheet + - for .txt it will assume completions and split on newline + """ + remediation = None + necessary_msg = None + immediate_msg = None + error_msg = None + df = None + + if os.path.isfile(fname): + for ending, separator in [(".csv", ","), (".tsv", "\t")]: + if fname.lower().endswith(ending): + immediate_msg = f"\n- Based on your file extension, your file is formatted as a {ending[1:].upper()} file" + necessary_msg = ( + f"Your format `{ending[1:].upper()}` will be converted to `JSONL`" + ) + df = pd.read_csv(fname, sep=separator, dtype=str) + if fname.lower().endswith(".xlsx"): + immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file" + necessary_msg = "Your format `XLSX` will be converted to `JSONL`" + xls = pd.ExcelFile(fname) + sheets = xls.sheet_names + if len(sheets) > 1: + immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..." + df = pd.read_excel(fname, dtype=str) + if fname.lower().endswith(".txt"): + immediate_msg = "\n- Based on your file extension, you provided a text file" + necessary_msg = "Your format `TXT` will be converted to `JSONL`" + with open(fname, "r") as f: + content = f.read() + df = pd.DataFrame( + [["", line] for line in content.split("\n")], + columns=["prompt", "completion"], + dtype=str, + ) + if fname.lower().endswith("jsonl") or fname.lower().endswith("json"): + try: + df = pd.read_json(fname, lines=True, dtype=str) + except (ValueError, TypeError): + df = pd.read_json(fname, dtype=str) + immediate_msg = "\n- Your file appears to be in a .JSON format. Your file will be converted to JSONL format" + necessary_msg = "Your format `JSON` will be converted to `JSONL`" + + if df is None: + error_msg = ( + "Your file is not saved as a .CSV, .TSV, .XLSX, .TXT or .JSONL file." + ) + if "." in fname: + error_msg += ( + f" Your file `{fname}` appears to end with `.{fname.split('.')[1]}`" + ) + else: + error_msg += f" Your file `{fname}` does not appear to have a file ending. Please ensure your filename ends with one of the supported file endings." + else: + error_msg = f"File {fname} does not exist." + + remediation = Remediation( + name="read_any_format", + necessary_msg=necessary_msg, + immediate_msg=immediate_msg, + error_msg=error_msg, + ) + return df, remediation + + +def format_inferrer_validator(df): + """ + This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification. 
+    It will also suggest using ada and --no_packing, and explain the benefits of a train/validation split.
+    """
+    ft_type = infer_task_type(df)
+    immediate_msg = None
+    if ft_type == "classification":
+        immediate_msg = f"\n- Based on your data it seems like you're trying to fine-tune a model for {ft_type}\n- For classification, we recommend you try one of the faster and cheaper models, such as `ada`. You should also set the `--no_packing` parameter when fine-tuning\n- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training"
+    return Remediation(name="format_inferrer", immediate_msg=immediate_msg)
+
+
+def apply_necessary_remediation(df, remediation):
+    """
+    This function will apply a necessary remediation to a dataframe, or print an error message if one exists.
+    """
+    if remediation.error_msg is not None:
+        sys.stderr.write(
+            f"\n\nERROR in {remediation.name} validator: {remediation.error_msg}\n\nAborting..."
+        )
+        sys.exit(1)
+    if remediation.immediate_msg is not None:
+        sys.stdout.write(remediation.immediate_msg)
+    if remediation.necessary_fn is not None:
+        df = remediation.necessary_fn(df)
+    return df
+
+
+def apply_optional_remediation(df, remediation):
+    """
+    This function will apply an optional remediation to a dataframe, based on the user input.
+    """
+    if remediation.optional_msg is not None:
+        if input(f"- [Recommended] {remediation.optional_msg} [Y/n]: ").lower() != "n":
+            df = remediation.optional_fn(df)
+    if remediation.necessary_msg is not None:
+        sys.stdout.write(f"- [Necessary] {remediation.necessary_msg}\n")
+    return df
+
+
+def write_out_file(df, fname, any_remediations):
+    """
+    This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
+    For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
+    """
+    ft_format = infer_task_type(df)
+    common_prompt_suffix = get_common_xfix(df.prompt, xfix="suffix")
+    common_completion_suffix = get_common_xfix(df.completion, xfix="suffix")
+
+    split = False
+    if ft_format == "classification":
+        if (
+            input(
+                "- [Recommended] Would you like to split into training and validation set? [Y/n]: "
+            ).lower()
+            != "n"
+        ):
+            split = True
+
+    packing_param = " --no_packing" if ft_format == "classification" else ""
+    common_prompt_suffix_new_line_handled = common_prompt_suffix.replace("\n", "\\n")
+    common_completion_suffix_new_line_handled = common_completion_suffix.replace(
+        "\n", "\\n"
+    )
+    optional_ending_string = (
+        f' Make sure to include `stop=["{common_completion_suffix_new_line_handled}"]` so that the generated texts end at the expected place.'
+        if len(common_completion_suffix_new_line_handled) > 0
+        else ""
+    )
+
+    if not any_remediations:
+        sys.stdout.write(
+            f'\nYou can use your file for fine-tuning:\n> openai api fine_tunes.create -t "{fname}"{packing_param}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n'
+        )
+
+    elif (
+        input(
+            "\n\nYour data will be written to a new JSONL file. Proceed [Y/n]: "
+        ).lower()
+        != "n"
+    ):
+
+        suffixes = ["_train", "_valid"] if split else [""]
+        outfnames = []
+        indices = None
+        for suffix in suffixes:
+            out_fname = fname.split(".")[0] + "_prepared" + suffix + ".jsonl"
+
+            # check if file already exists, and if it does, add a number to the end
+            i = 0
+            while True:
+                to_continue = False
+                # in case of train and test, make sure that the numbers will match
+                for suf in suffixes:
+                    out_fname = (
+                        fname.split(".")[0] + "_prepared" + suf + f" ({i})" + ".jsonl"
+                    )
+                    if i == 0:
+                        out_fname = fname.split(".")[0] + "_prepared" + suf + ".jsonl"
+                    i += 1
+                    if os.path.isfile(out_fname):
+                        to_continue = True
+                if to_continue:
+                    continue
+                break
+
+            outfnames.append(out_fname)
+            if suffix == "_train":
+                MAX_VALID_EXAMPLES = 1000
+                n = max(len(df) - MAX_VALID_EXAMPLES, int(len(df) * 0.8))
+                df_out = df.sample(n=n, random_state=42)
+                indices = df_out.index
+            if suffix == "_valid":
+                df_out = df.drop(indices)
+            if suffix == "":
+                df_out = df
+            df_out[["prompt", "completion"]].to_json(
+                out_fname, lines=True, orient="records"
+            )
+
+        # Add -v VALID_FILE if we split the file into train / valid
+        files_string = ("s" if split else "") + " to `" + ("` and `".join(outfnames))
+        valid_string = f' -v "{outfnames[1]}"' if split else ""
+        sys.stdout.write(
+            f'\nWrote modified file{files_string}`\nFeel free to take a look!\n\nNow use that file when fine-tuning:\n> openai api fine_tunes.create -t "{outfnames[0]}"{valid_string}{packing_param}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n'
+        )
+    else:
+        sys.stdout.write("Aborting... did not write the file\n")
+
+
+def infer_task_type(df):
+    """
+    Infer the likely fine-tuning task type from the data
+    """
+    CLASSIFICATION_THRESHOLD = 3  # min_average instances of each class
+    if sum(df.prompt.str.len()) == 0:
+        return "open-ended generation"
+
+    if len(df.completion.unique()) < len(df) / CLASSIFICATION_THRESHOLD:
+        return "classification"
+
+    return "conditional generation"
+
+
+def get_common_xfix(series, xfix="suffix"):
+    """
+    Finds the longest common suffix or prefix of all the values in a series
+    """
+    common_xfix = ""
+    while True:
+        common_xfixes = (
+            series.str[-(len(common_xfix) + 1) :]
+            if xfix == "suffix"
+            else series.str[: len(common_xfix) + 1]
+        )  # first few or last few characters
+        if (
+            common_xfixes.nunique() != 1
+        ):  # we found the character at which we don't have a unique xfix anymore
+            break
+        elif (
+            common_xfix == common_xfixes.values[0]
+        ):  # the entire first row is a prefix of every other row
+            break
+        else:  # the first or last few characters are still common across all rows - let's try to add one more
+            common_xfix = common_xfixes.values[0]
+    return common_xfix
+
+
+def get_validators():
+    return [
+        num_examples_validator,
+        lambda x: necessary_column_validator(x, "prompt"),
+        lambda x: necessary_column_validator(x, "completion"),
+        additional_column_validator,
+        non_empty_completion_validator,
+        format_inferrer_validator,
+        duplicated_rows_validator,
+        lambda x: lower_case_validator(x, "prompt"),
+        lambda x: lower_case_validator(x, "completion"),
+        common_prompt_suffix_validator,
+        common_prompt_prefix_validator,
+        common_completion_prefix_validator,
+        common_completion_suffix_validator,
+        completions_space_start_validator,
+    ]
diff --git a/openai/version.py b/openai/version.py
index 44f4ad01d1..976684abd7 100644
--- a/openai/version.py
+++ b/openai/version.py
@@ -1 +1 @@
-VERSION = "0.8.0"
+VERSION = "0.9.0"
diff --git a/setup.py b/setup.py
index dce57104e7..21bb3fcbb2 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,9 @@
     install_requires=[
        "requests>=2.20",  # to get the patch for CVE-2018-18074
        "tqdm",  # Needed for progress bars
+       "pandas>=1.2.3",  # Needed for CLI fine-tuning data preparation tool
+       "pandas-stubs>=1.1.0.11",  # Needed for type hints for mypy
+       "openpyxl>=3.0.7",  # Needed for CLI fine-tuning data preparation tool xlsx format
     ],
     extras_require={"dev": ["black==20.8b1", "pytest==6.*"]},
     python_requires=">=3.6",
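
For reference, a typical end-to-end workflow with the commands this diff adds might look like the following sketch. The input file name and the fine-tune ID are placeholders; the exact suggested flags (e.g. --no_packing) come from the prepare_data tool's own output.

    # Analyze a local dataset and write out a cleaned data_prepared.jsonl
    # (plus data_prepared_train.jsonl / data_prepared_valid.jsonl if you accept the split)
    openai tools fine_tunes.prepare_data -f data.csv

    # Start a fine-tune from the prepared file; the tool suggests --no_packing
    # for classification tasks
    openai api fine_tunes.create -t data_prepared.jsonl --no_packing

    # Once the job completes, download its results file
    openai api fine_tunes.results -i <FINE_TUNE_ID>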