Skip to content

Lots of CLI changes #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 29, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions bin/openai
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
#!/usr/bin/env python
import argparse
import json
import logging
import os
import sys

import openai
from openai.cli import display_error
from openai.cli import register as api_register
from openai.cli import api_register, tools_register

logger = logging.getLogger()
formatter = logging.Formatter("[%(asctime)s] %(message)s")
@@ -40,9 +38,11 @@ def main():
parser.set_defaults(func=help)

subparsers = parser.add_subparsers()
sub = subparsers.add_parser("api", help="Direct API calls")
sub_api = subparsers.add_parser("api", help="Direct API calls")
sub_tools = subparsers.add_parser("tools", help="Client side tools for convenience")

api_register(sub)
api_register(sub_api)
tools_register(sub_tools)

args = parser.parse_args()
if args.verbosity == 1:
2 changes: 1 addition & 1 deletion openai/api_resources/completion.py
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ def create(cls, *args, **kwargs):
of valid parameters.
"""
start = time.time()
timeout = kwargs.get("timeout", None)
timeout = kwargs.pop("timeout", None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rachellim What's the reason for this change? This is preventing the timeout parameter from being passed on to super().create(...), meaning that API users have no way to specify a timeout.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @feroldi! In trying to avoid a backwards incompatible change, we added a new param called request_timeout so that users could set a timeout that didn't interfere with the existing timeout functionality. It's documented here: https://github.com/openai/openai-python#params

Does that help?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! That makes sense. Thanks.

if kwargs.get("model", None) is None and kwargs.get("engine", None) is None:
raise InvalidRequestError(
"Must provide an 'engine' or 'model' parameter to create a Completion.",
59 changes: 59 additions & 0 deletions openai/api_resources/file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import absolute_import, division, print_function

import json
import os

import openai
from openai import api_requestor, util
from openai.api_resources.abstract import (
@@ -29,3 +32,59 @@ def create(
return util.convert_to_openai_object(
response, api_key, api_version, organization
)

@classmethod
def download(
    cls, id, api_key=None, api_base=None, api_version=None, organization=None
):
    """Fetch the raw byte contents of an uploaded file.

    Issues a GET against ``{class_url}/{id}/content`` and returns the raw
    response body on a 2xx status; otherwise raises the API error built
    from the error response.
    """
    api_req = api_requestor.APIRequestor(
        api_key,
        api_base=api_base or openai.file_api_base or openai.api_base,
        api_version=api_version,
        organization=organization,
    )
    content_url = f"{cls.class_url()}/{id}/content"
    body, status, headers, _, _ = api_req.request_raw("get", content_url)
    if 200 <= status < 300:
        return body
    raise api_req.handle_error_response(
        body, status, json.loads(body), headers, stream_error=False
    )

@classmethod
def find_matching_files(
    cls,
    api_key=None,
    api_base=None,
    api_version=None,
    organization=None,
    file=None,
    purpose=None,
):
    """Return already-uploaded files that look like duplicates of *file*.

    A remote file is considered a match when its ``purpose``, ``filename``
    and byte size all equal those of the local *file* object.

    Args:
        file: an open file-like object; must expose ``name``, ``seek`` and
            ``tell`` to be matched (objects without a ``name`` never match).
        purpose: the upload purpose to filter remote files by.

    Raises:
        openai.error.InvalidRequestError: if *file* or *purpose* is missing.
    """
    if file is None:
        raise openai.error.InvalidRequestError(
            "'file' is a required property", "file"
        )
    if purpose is None:
        raise openai.error.InvalidRequestError(
            "'purpose' is a required property", "purpose"
        )
    all_files = cls.list(
        api_key=api_key,
        api_base=api_base or openai.file_api_base or openai.api_base,
        api_version=api_version,
        organization=organization,
    ).get("data", [])

    # Objects without a name (e.g. anonymous streams) can never match.
    if not hasattr(file, "name"):
        return []

    # Measure the local file size once, instead of seeking to the end on
    # every loop iteration as the original did; leave the cursor at 0 so
    # the file is ready to be read (e.g. re-uploaded) by the caller.
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)

    return [
        f
        for f in all_files
        if f["purpose"] == purpose
        and f["filename"] == file.name
        and f["bytes"] == file_size
    ]
241 changes: 225 additions & 16 deletions openai/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import datetime
import json
import os
import signal
import sys
import warnings
from openai.validators import (
write_out_file,
apply_necessary_remediation,
apply_optional_remediation,
read_any_format,
get_validators,
)

import openai

@@ -221,12 +227,48 @@ def list(cls, args):
print(resp)

@classmethod
def _get_or_upload(cls, file):
def _get_or_upload(cls, file, check_if_file_exists=True):
try:
openai.File.retrieve(file)
except openai.error.InvalidRequestError as e:
if e.http_status == 404 and os.path.isfile(file):
resp = openai.File.create(file=open(file), purpose="fine-tune")
matching_files = openai.File.find_matching_files(
file=open(file), purpose="fine-tune"
)
if len(matching_files) > 0 and check_if_file_exists:
file_ids = [f["id"] for f in matching_files]
sys.stdout.write(
"Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
name=matching_files[0]["filename"],
size=matching_files[0]["bytes"],
)
)
sys.stdout.write("\n".join(file_ids))
while True:
sys.stdout.write(
"\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
)
inp = sys.stdin.readline().strip()
if inp in file_ids:
sys.stdout.write(
"Using your file {file}: {id}\n".format(
file=file, id=inp
)
)
return inp
elif inp == "":
break
else:
sys.stdout.write(
"File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
id=inp
)
)

resp = openai.File.create(
file=open(file),
purpose="fine-tune",
)
sys.stdout.write(
"Uploaded file from {file}: {id}\n".format(file=file, id=resp["id"])
)
@@ -236,21 +278,38 @@ def _get_or_upload(cls, file):
@classmethod
def create(cls, args):
create_args = {
"training_file": cls._get_or_upload(args.training_file),
"training_file": cls._get_or_upload(
args.training_file, args.check_if_files_exist
),
}
if args.validation_file:
create_args["validation_file"] = cls._get_or_upload(args.validation_file)
create_args["validation_file"] = cls._get_or_upload(
args.validation_file, args.check_if_files_exist
)
if args.model:
create_args["model"] = args.model
if args.hparams:
try:
hparams = json.loads(args.hparams)
except json.decoder.JSONDecodeError:
sys.stderr.write(
"--hparams must be JSON decodable and match the hyperparameter arguments of the API"
)
sys.exit(1)
create_args.update(hparams)
if args.n_epochs:
create_args["n_epochs"] = args.n_epochs
if args.batch_size:
create_args["batch_size"] = args.batch_size
if args.learning_rate_multiplier:
create_args["learning_rate_multiplier"] = args.learning_rate_multiplier
create_args["use_packing"] = args.use_packing
if args.prompt_loss_weight:
create_args["prompt_loss_weight"] = args.prompt_loss_weight
if args.compute_classification_metrics:
create_args[
"compute_classification_metrics"
] = args.compute_classification_metrics
if args.classification_n_classes:
create_args["classification_n_classes"] = args.classification_n_classes
if args.classification_positive_class:
create_args[
"classification_positive_class"
] = args.classification_positive_class
if args.classification_betas:
betas = [float(x) for x in args.classification_betas.split(",")]
create_args["classification_betas"] = betas

resp = openai.FineTune.create(**create_args)

@@ -271,6 +330,18 @@ def create(cls, args):
def get(cls, args):
    """Fetch a single fine-tune job by id and print it.

    The original also unconditionally printed ``resp["result_files"][0]``,
    which raises for any job that has no result files yet (e.g. still
    running) and duplicates the dedicated ``fine_tunes.results`` command;
    that debug-style line is removed.
    """
    resp = openai.FineTune.retrieve(id=args.id)
    print(resp)

@classmethod
def results(cls, args):
    """Download and print (UTF-8 decoded) the results file of a fine-tune job.

    Raises:
        openai.error.InvalidRequestError: when the job has no results file
            available (e.g. it has not finished yet).
    """
    fine_tune = openai.FineTune.retrieve(id=args.id)
    if "result_files" not in fine_tune or len(fine_tune["result_files"]) == 0:
        raise openai.error.InvalidRequestError(
            f"No results file available for fine-tune {args.id}", "id"
        )
    # Reuse the object we already fetched; the original issued a second,
    # redundant retrieve() network call for the same job here.
    result_file = fine_tune["result_files"][0]
    resp = openai.File.download(id=result_file["id"])
    print(resp.decode("utf-8"))

@classmethod
def events(cls, args):
@@ -329,8 +400,69 @@ def cancel(cls, args):
resp = openai.FineTune.cancel(id=args.id)
print(resp)

@classmethod
def prepare_data(cls, args):
    """Analyze a local dataset and prepare it for fine-tuning.

    Reads the file given by ``args.file`` (JSONL/JSON/CSV/TSV/TXT/XLSX),
    runs every validator over it, applies necessary remediations in-place
    on the dataframe, offers optional remediations, and writes the
    prepared output file.
    """
    sys.stdout.write("Analyzing...\n")
    fname = args.file
    df, remediation = read_any_format(fname)
    apply_necessary_remediation(None, remediation)

    validators = get_validators()

    optional_remediations = []
    if remediation is not None:
        optional_remediations.append(remediation)
    for validator in validators:
        remediation = validator(df)
        if remediation is not None:
            optional_remediations.append(remediation)
            df = apply_necessary_remediation(df, remediation)

    # Generator expression instead of a materialized list inside any()
    # (flake8-comprehensions C419): short-circuits and allocates nothing.
    any_optional_or_necessary_remediations = any(
        remediation.optional_msg is not None
        or remediation.necessary_msg is not None
        for remediation in optional_remediations
    )

    if any_optional_or_necessary_remediations:
        sys.stdout.write(
            "\n\nBased on the analysis we will perform the following actions:\n"
        )

        for remediation in optional_remediations:
            df = apply_optional_remediation(df, remediation)
    else:
        sys.stdout.write("\n\nNo remediations found.\n")

    write_out_file(df, fname, any_optional_or_necessary_remediations)


def register(parser):
def tools_register(parser):
    """Attach the client-side convenience tool subcommands to *parser*."""
    subparsers = parser.add_subparsers(
        title="Tools", help="Convenience client side tools"
    )

    def help(args):
        # Default action when no tool subcommand is given: show usage.
        parser.print_help()

    parser.set_defaults(func=help)

    prepare_parser = subparsers.add_parser("fine_tunes.prepare_data")
    prepare_parser.add_argument(
        "-f",
        "--file",
        required=True,
        help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed."
        "This should be the local file path.",
    )
    prepare_parser.set_defaults(func=FineTune.prepare_data)


def api_register(parser):
# Engine management
subparsers = parser.add_subparsers(help="All API subcommands")

@@ -544,6 +676,12 @@ def help(args):
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
"or a local file path.",
)
sub.add_argument(
"--no_check_if_files_exist",
dest="check_if_files_exist",
action="store_false",
help="If this argument is set and training_file or validation_file are file paths, immediately upload them. If this argument is not set, check if they may be duplicates of already uploaded files before uploading, based on file name and file size.",
)
sub.add_argument(
"-m",
"--model",
@@ -554,13 +692,84 @@ def help(args):
action="store_true",
help="If set, returns immediately after creating the job. Otherwise, waits for the job to complete.",
)
sub.add_argument("-p", "--hparams", help="Hyperparameter JSON")
sub.add_argument(
"--n_epochs",
type=int,
help="The number of epochs to train the model for. An epoch refers to one "
"full cycle through the training dataset.",
)
sub.add_argument(
"--batch_size",
type=int,
help="The batch size to use for training. The batch size is the number of "
"training examples used to train a single forward and backward pass.",
)
sub.add_argument(
"--learning_rate_multiplier",
type=float,
help="The learning rate multiplier to use for training. The fine-tuning "
"learning rate is determined by the original learning rate used for "
"pretraining multiplied by this value",
)
sub.add_argument(
"--use_packing",
action="store_true",
dest="use_packing",
help="On classification tasks, we recommend not setting this flag. "
"On all other tasks, we recommend setting it. "
"When set, we pack as many prompt-completion pairs as possible into each "
"training example. This greatly increases the speed of a fine-tuning job, "
"often without negatively affecting model performance.",
)
sub.add_argument(
"--no_packing",
action="store_false",
dest="use_packing",
help="Disables the packing flag (see --use_packing for description)",
)
sub.set_defaults(use_packing=True)
sub.add_argument(
"--prompt_loss_weight",
type=float,
help="The weight to use for the prompt loss. The optimum value here depends "
"depends on your use case. This determines how much the model prioritizes "
"learning from prompt tokens vs learning from completion tokens",
)
sub.add_argument(
"--compute_classification_metrics",
action="store_true",
help="If set, we calculate classification-specific metrics such as accuracy "
"and F-1 score using the validation set at the end of every epoch.",
)
sub.add_argument(
"--classification_n_classes",
type=int,
help="The number of classes in a classification task. This parameter is "
"required for multiclass classification",
)
sub.add_argument(
"--classification_positive_class",
help="The positive class in binary classification. This parameter is needed "
"to generate precision, recall and F-1 metrics when doing binary "
"classification",
)
sub.add_argument(
"--classification_betas",
help="If this is provided, we calculate F-beta scores at the specified beta "
"values. The F-beta score is a generalization of F-1 score. This is only "
"used for binary classification. The expected format is a comma-separated "
"list - e.g. 1,1.5,2",
)
sub.set_defaults(func=FineTune.create)

sub = subparsers.add_parser("fine_tunes.get")
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
sub.set_defaults(func=FineTune.get)

sub = subparsers.add_parser("fine_tunes.results")
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
sub.set_defaults(func=FineTune.results)

sub = subparsers.add_parser("fine_tunes.events")
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
sub.add_argument(
679 changes: 679 additions & 0 deletions openai/validators.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion openai/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "0.8.0"
VERSION = "0.9.0"
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -16,6 +16,9 @@
install_requires=[
"requests>=2.20", # to get the patch for CVE-2018-18074
"tqdm", # Needed for progress bars
"pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
"pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
"openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
],
extras_require={"dev": ["black==20.8b1", "pytest==6.*"]},
python_requires=">=3.6",