Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 7ddcba1

Browse files
rachellim, todor-markov, BorisPower, davecummings
authored Jun 29, 2021
Lots of CLI changes (#22)
* Add CLI option to download files (#34)

* Option to check if file has been uploaded in the past before uploading (#33)

  The check is done based on filename, file purpose and file size.

* Add fine-tuning hparams directly into the fine-tunes CLI (#35)

* Update fine_tunes CLI use_packing argument (#38)

* A file verification and remediation tool. It applies the following validations:
  - prints the number of examples, and warns if it's lower than 100
  - ensures prompt and completion columns are present
  - optionally removes any additional columns
  - ensures all completions are non-empty
  - infers which type of fine-tuning the data is most likely in (classification, conditional generation and open-ended generation)
  - optionally removes duplicate rows
  - infers the existence of a common suffix, and if there is none, suggests one for classification and conditional generation
  - optionally prepends a space to each completion, to make tokenization better
  - optionally splits into training and validation set for the classification use case
  - optionally ensures there's an ending string for all completions
  - optionally lowercases completions or prompts if more than a 1/3 of alphanumeric characters are upper case

  It interactively asks the user to accept or reject recommendations. If the user is happy, then it saves the modified output file as a jsonl, which is ready for being used in fine-tuning with the printed command.

* Completion: remove from kwargs before passing to EngineAPI (#37)

* Version bump before pushing to external

Co-authored-by: Todor Markov <todor.m.markov@gmail.com>
Co-authored-by: Boris Power <81998504+BorisPower@users.noreply.github.com>
Co-authored-by: Dave Cummings <dave@openai.com>
1 parent 250c33d commit 7ddcba1

File tree

7 files changed

+973
-23
lines changed

7 files changed

+973
-23
lines changed
 

‎bin/openai

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
#!/usr/bin/env python
22
import argparse
3-
import json
43
import logging
5-
import os
64
import sys
75

86
import openai
97
from openai.cli import display_error
10-
from openai.cli import register as api_register
8+
from openai.cli import api_register, tools_register
119

1210
logger = logging.getLogger()
1311
formatter = logging.Formatter("[%(asctime)s] %(message)s")
@@ -40,9 +38,11 @@ def main():
4038
parser.set_defaults(func=help)
4139

4240
subparsers = parser.add_subparsers()
43-
sub = subparsers.add_parser("api", help="Direct API calls")
41+
sub_api = subparsers.add_parser("api", help="Direct API calls")
42+
sub_tools = subparsers.add_parser("tools", help="Client side tools for convenience")
4443

45-
api_register(sub)
44+
api_register(sub_api)
45+
tools_register(sub_tools)
4646

4747
args = parser.parse_args()
4848
if args.verbosity == 1:

‎openai/api_resources/completion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def create(cls, *args, **kwargs):
1919
of valid parameters.
2020
"""
2121
start = time.time()
22-
timeout = kwargs.get("timeout", None)
22+
timeout = kwargs.pop("timeout", None)
2323
if kwargs.get("model", None) is None and kwargs.get("engine", None) is None:
2424
raise InvalidRequestError(
2525
"Must provide an 'engine' or 'model' parameter to create a Completion.",

‎openai/api_resources/file.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from __future__ import absolute_import, division, print_function
22

3+
import json
4+
import os
5+
36
import openai
47
from openai import api_requestor, util
58
from openai.api_resources.abstract import (
@@ -29,3 +32,59 @@ def create(
2932
return util.convert_to_openai_object(
3033
response, api_key, api_version, organization
3134
)
35+
36+
@classmethod
37+
def download(
38+
cls, id, api_key=None, api_base=None, api_version=None, organization=None
39+
):
40+
requestor = api_requestor.APIRequestor(
41+
api_key,
42+
api_base=api_base or openai.file_api_base or openai.api_base,
43+
api_version=api_version,
44+
organization=organization,
45+
)
46+
url = f"{cls.class_url()}/{id}/content"
47+
rbody, rcode, rheaders, _, _ = requestor.request_raw("get", url)
48+
if not 200 <= rcode < 300:
49+
raise requestor.handle_error_response(
50+
rbody, rcode, json.loads(rbody), rheaders, stream_error=False
51+
)
52+
return rbody
53+
54+
@classmethod
55+
def find_matching_files(
56+
cls,
57+
api_key=None,
58+
api_base=None,
59+
api_version=None,
60+
organization=None,
61+
file=None,
62+
purpose=None,
63+
):
64+
if file is None:
65+
raise openai.error.InvalidRequestError(
66+
"'file' is a required property", "file"
67+
)
68+
if purpose is None:
69+
raise openai.error.InvalidRequestError(
70+
"'purpose' is a required property", "purpose"
71+
)
72+
all_files = cls.list(
73+
api_key=api_key,
74+
api_base=api_base or openai.file_api_base or openai.api_base,
75+
api_version=api_version,
76+
organization=organization,
77+
).get("data", [])
78+
matching_files = []
79+
for f in all_files:
80+
if f["purpose"] != purpose:
81+
continue
82+
if not hasattr(file, "name") or f["filename"] != file.name:
83+
continue
84+
file.seek(0, os.SEEK_END)
85+
if f["bytes"] != file.tell():
86+
file.seek(0)
87+
continue
88+
file.seek(0)
89+
matching_files.append(f)
90+
return matching_files

‎openai/cli.py

Lines changed: 225 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
import datetime
2-
import json
32
import os
43
import signal
54
import sys
65
import warnings
6+
from openai.validators import (
7+
write_out_file,
8+
apply_necessary_remediation,
9+
apply_optional_remediation,
10+
read_any_format,
11+
get_validators,
12+
)
713

814
import openai
915

@@ -221,12 +227,48 @@ def list(cls, args):
221227
print(resp)
222228

223229
@classmethod
224-
def _get_or_upload(cls, file):
230+
def _get_or_upload(cls, file, check_if_file_exists=True):
225231
try:
226232
openai.File.retrieve(file)
227233
except openai.error.InvalidRequestError as e:
228234
if e.http_status == 404 and os.path.isfile(file):
229-
resp = openai.File.create(file=open(file), purpose="fine-tune")
235+
matching_files = openai.File.find_matching_files(
236+
file=open(file), purpose="fine-tune"
237+
)
238+
if len(matching_files) > 0 and check_if_file_exists:
239+
file_ids = [f["id"] for f in matching_files]
240+
sys.stdout.write(
241+
"Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
242+
name=matching_files[0]["filename"],
243+
size=matching_files[0]["bytes"],
244+
)
245+
)
246+
sys.stdout.write("\n".join(file_ids))
247+
while True:
248+
sys.stdout.write(
249+
"\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
250+
)
251+
inp = sys.stdin.readline().strip()
252+
if inp in file_ids:
253+
sys.stdout.write(
254+
"Using your file {file}: {id}\n".format(
255+
file=file, id=inp
256+
)
257+
)
258+
return inp
259+
elif inp == "":
260+
break
261+
else:
262+
sys.stdout.write(
263+
"File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
264+
id=inp
265+
)
266+
)
267+
268+
resp = openai.File.create(
269+
file=open(file),
270+
purpose="fine-tune",
271+
)
230272
sys.stdout.write(
231273
"Uploaded file from {file}: {id}\n".format(file=file, id=resp["id"])
232274
)
@@ -236,21 +278,38 @@ def _get_or_upload(cls, file):
236278
@classmethod
237279
def create(cls, args):
238280
create_args = {
239-
"training_file": cls._get_or_upload(args.training_file),
281+
"training_file": cls._get_or_upload(
282+
args.training_file, args.check_if_files_exist
283+
),
240284
}
241285
if args.validation_file:
242-
create_args["validation_file"] = cls._get_or_upload(args.validation_file)
286+
create_args["validation_file"] = cls._get_or_upload(
287+
args.validation_file, args.check_if_files_exist
288+
)
243289
if args.model:
244290
create_args["model"] = args.model
245-
if args.hparams:
246-
try:
247-
hparams = json.loads(args.hparams)
248-
except json.decoder.JSONDecodeError:
249-
sys.stderr.write(
250-
"--hparams must be JSON decodable and match the hyperparameter arguments of the API"
251-
)
252-
sys.exit(1)
253-
create_args.update(hparams)
291+
if args.n_epochs:
292+
create_args["n_epochs"] = args.n_epochs
293+
if args.batch_size:
294+
create_args["batch_size"] = args.batch_size
295+
if args.learning_rate_multiplier:
296+
create_args["learning_rate_multiplier"] = args.learning_rate_multiplier
297+
create_args["use_packing"] = args.use_packing
298+
if args.prompt_loss_weight:
299+
create_args["prompt_loss_weight"] = args.prompt_loss_weight
300+
if args.compute_classification_metrics:
301+
create_args[
302+
"compute_classification_metrics"
303+
] = args.compute_classification_metrics
304+
if args.classification_n_classes:
305+
create_args["classification_n_classes"] = args.classification_n_classes
306+
if args.classification_positive_class:
307+
create_args[
308+
"classification_positive_class"
309+
] = args.classification_positive_class
310+
if args.classification_betas:
311+
betas = [float(x) for x in args.classification_betas.split(",")]
312+
create_args["classification_betas"] = betas
254313

255314
resp = openai.FineTune.create(**create_args)
256315

@@ -271,6 +330,18 @@ def create(cls, args):
271330
def get(cls, args):
272331
resp = openai.FineTune.retrieve(id=args.id)
273332
print(resp)
333+
print(resp["result_files"][0])
334+
335+
@classmethod
336+
def results(cls, args):
337+
fine_tune = openai.FineTune.retrieve(id=args.id)
338+
if "result_files" not in fine_tune or len(fine_tune["result_files"]) == 0:
339+
raise openai.error.InvalidRequestError(
340+
f"No results file available for fine-tune {args.id}", "id"
341+
)
342+
result_file = openai.FineTune.retrieve(id=args.id)["result_files"][0]
343+
resp = openai.File.download(id=result_file["id"])
344+
print(resp.decode("utf-8"))
274345

275346
@classmethod
276347
def events(cls, args):
@@ -329,8 +400,69 @@ def cancel(cls, args):
329400
resp = openai.FineTune.cancel(id=args.id)
330401
print(resp)
331402

403+
@classmethod
404+
def prepare_data(cls, args):
405+
406+
sys.stdout.write("Analyzing...\n")
407+
fname = args.file
408+
df, remediation = read_any_format(fname)
409+
apply_necessary_remediation(None, remediation)
410+
411+
validators = get_validators()
412+
413+
optional_remediations = []
414+
if remediation is not None:
415+
optional_remediations.append(remediation)
416+
for validator in validators:
417+
remediation = validator(df)
418+
if remediation is not None:
419+
optional_remediations.append(remediation)
420+
df = apply_necessary_remediation(df, remediation)
421+
422+
any_optional_or_necessary_remediations = any(
423+
[
424+
remediation
425+
for remediation in optional_remediations
426+
if remediation.optional_msg is not None
427+
or remediation.necessary_msg is not None
428+
]
429+
)
430+
431+
if any_optional_or_necessary_remediations:
432+
sys.stdout.write(
433+
"\n\nBased on the analysis we will perform the following actions:\n"
434+
)
435+
436+
for remediation in optional_remediations:
437+
df = apply_optional_remediation(df, remediation)
438+
else:
439+
sys.stdout.write("\n\nNo remediations found.\n")
440+
441+
write_out_file(df, fname, any_optional_or_necessary_remediations)
442+
332443

333-
def register(parser):
444+
def tools_register(parser):
445+
subparsers = parser.add_subparsers(
446+
title="Tools", help="Convenience client side tools"
447+
)
448+
449+
def help(args):
450+
parser.print_help()
451+
452+
parser.set_defaults(func=help)
453+
454+
sub = subparsers.add_parser("fine_tunes.prepare_data")
455+
sub.add_argument(
456+
"-f",
457+
"--file",
458+
required=True,
459+
help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed."
460+
"This should be the local file path.",
461+
)
462+
sub.set_defaults(func=FineTune.prepare_data)
463+
464+
465+
def api_register(parser):
334466
# Engine management
335467
subparsers = parser.add_subparsers(help="All API subcommands")
336468

@@ -544,6 +676,12 @@ def help(args):
544676
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
545677
"or a local file path.",
546678
)
679+
sub.add_argument(
680+
"--no_check_if_files_exist",
681+
dest="check_if_files_exist",
682+
action="store_false",
683+
help="If this argument is set and training_file or validation_file are file paths, immediately upload them. If this argument is not set, check if they may be duplicates of already uploaded files before uploading, based on file name and file size.",
684+
)
547685
sub.add_argument(
548686
"-m",
549687
"--model",
@@ -554,13 +692,84 @@ def help(args):
554692
action="store_true",
555693
help="If set, returns immediately after creating the job. Otherwise, waits for the job to complete.",
556694
)
557-
sub.add_argument("-p", "--hparams", help="Hyperparameter JSON")
695+
sub.add_argument(
696+
"--n_epochs",
697+
type=int,
698+
help="The number of epochs to train the model for. An epoch refers to one "
699+
"full cycle through the training dataset.",
700+
)
701+
sub.add_argument(
702+
"--batch_size",
703+
type=int,
704+
help="The batch size to use for training. The batch size is the number of "
705+
"training examples used to train a single forward and backward pass.",
706+
)
707+
sub.add_argument(
708+
"--learning_rate_multiplier",
709+
type=float,
710+
help="The learning rate multiplier to use for training. The fine-tuning "
711+
"learning rate is determined by the original learning rate used for "
712+
"pretraining multiplied by this value",
713+
)
714+
sub.add_argument(
715+
"--use_packing",
716+
action="store_true",
717+
dest="use_packing",
718+
help="On classification tasks, we recommend not setting this flag. "
719+
"On all other tasks, we recommend setting it. "
720+
"When set, we pack as many prompt-completion pairs as possible into each "
721+
"training example. This greatly increases the speed of a fine-tuning job, "
722+
"often without negatively affecting model performance.",
723+
)
724+
sub.add_argument(
725+
"--no_packing",
726+
action="store_false",
727+
dest="use_packing",
728+
help="Disables the packing flag (see --use_packing for description)",
729+
)
730+
sub.set_defaults(use_packing=True)
731+
sub.add_argument(
732+
"--prompt_loss_weight",
733+
type=float,
734+
help="The weight to use for the prompt loss. The optimum value here depends "
735+
"depends on your use case. This determines how much the model prioritizes "
736+
"learning from prompt tokens vs learning from completion tokens",
737+
)
738+
sub.add_argument(
739+
"--compute_classification_metrics",
740+
action="store_true",
741+
help="If set, we calculate classification-specific metrics such as accuracy "
742+
"and F-1 score using the validation set at the end of every epoch.",
743+
)
744+
sub.add_argument(
745+
"--classification_n_classes",
746+
type=int,
747+
help="The number of classes in a classification task. This parameter is "
748+
"required for multiclass classification",
749+
)
750+
sub.add_argument(
751+
"--classification_positive_class",
752+
help="The positive class in binary classification. This parameter is needed "
753+
"to generate precision, recall and F-1 metrics when doing binary "
754+
"classification",
755+
)
756+
sub.add_argument(
757+
"--classification_betas",
758+
help="If this is provided, we calculate F-beta scores at the specified beta "
759+
"values. The F-beta score is a generalization of F-1 score. This is only "
760+
"used for binary classification. The expected format is a comma-separated "
761+
"list - e.g. 1,1.5,2",
762+
)
558763
sub.set_defaults(func=FineTune.create)
559764

560765
sub = subparsers.add_parser("fine_tunes.get")
561766
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
562767
sub.set_defaults(func=FineTune.get)
563768

769+
sub = subparsers.add_parser("fine_tunes.results")
770+
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
771+
sub.set_defaults(func=FineTune.results)
772+
564773
sub = subparsers.add_parser("fine_tunes.events")
565774
sub.add_argument("-i", "--id", required=True, help="The id of the fine-tune job")
566775
sub.add_argument(

‎openai/validators.py

Lines changed: 679 additions & 0 deletions
Large diffs are not rendered by default.

‎openai/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.8.0"
1+
VERSION = "0.9.0"

‎setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
install_requires=[
1717
"requests>=2.20", # to get the patch for CVE-2018-18074
1818
"tqdm", # Needed for progress bars
19+
"pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
20+
"pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
21+
"openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
1922
],
2023
extras_require={"dev": ["black==20.8b1", "pytest==6.*"]},
2124
python_requires=">=3.6",

0 commit comments

Comments (0)
Please sign in to comment.