Skip to content

Commit b4fc30d

Browse files
BorisPower authored and rachellim committed
A file verification and remediation tool.
It applies the following validations:
- prints the number of examples, and warns if it's lower than 100
- ensures prompt and completion columns are present
- optionally removes any additional columns
- ensures all completions are non-empty
- infers which type of fine-tuning the data is most likely in (classification, conditional generation and open-ended generation)
- optionally removes duplicate rows
- infers the existence of a common suffix, and if there is none, suggests one for classification and conditional generation
- optionally prepends a space to each completion, to make tokenization better
- optionally splits into training and validation sets for the classification use case
- optionally ensures there's an ending string for all completions
- optionally lowercases completions or prompts if more than 1/3 of alphanumeric characters are upper case

It interactively asks the user to accept or reject recommendations. If the user is happy, it saves the modified output file as a JSONL file, which is ready to be used for fine-tuning with the printed command.
1 parent 7384a76 commit b4fc30d

File tree

4 files changed

+756
-6
lines changed

4 files changed

+756
-6
lines changed

bin/openai

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
#!/usr/bin/env python
22
import argparse
3-
import json
43
import logging
5-
import os
64
import sys
75

86
import openai
97
from openai.cli import display_error
10-
from openai.cli import register as api_register
8+
from openai.cli import api_register, tools_register
119

1210
logger = logging.getLogger()
1311
formatter = logging.Formatter("[%(asctime)s] %(message)s")
@@ -40,9 +38,11 @@ def main():
4038
parser.set_defaults(func=help)
4139

4240
subparsers = parser.add_subparsers()
43-
sub = subparsers.add_parser("api", help="Direct API calls")
41+
sub_api = subparsers.add_parser("api", help="Direct API calls")
42+
sub_tools = subparsers.add_parser("tools", help="Client side tools for convenience")
4443

45-
api_register(sub)
44+
api_register(sub_api)
45+
tools_register(sub_tools)
4646

4747
args = parser.parse_args()
4848
if args.verbosity == 1:

openai/cli.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
import signal
44
import sys
55
import warnings
6+
from openai.validators import (
7+
write_out_file,
8+
apply_necessary_remediation,
9+
apply_optional_remediation,
10+
read_any_format,
11+
get_validators,
12+
)
613

714
import openai
815

@@ -393,8 +400,69 @@ def cancel(cls, args):
393400
resp = openai.FineTune.cancel(id=args.id)
394401
print(resp)
395402

403+
@classmethod
def prepare_data(cls, args):
    """Analyze a fine-tuning data file and write out a remediated copy.

    Loads ``args.file`` with ``read_any_format``, runs every validator
    returned by ``get_validators`` against the resulting dataframe, applies
    the remediations they report, and finally hands the dataframe to
    ``write_out_file``.

    Args:
        args: Parsed CLI arguments; only ``args.file`` is read here.
    """
    sys.stdout.write("Analyzing...\n")
    path = args.file
    df, load_remediation = read_any_format(path)
    # Called with no dataframe and the result discarded, so this is run
    # only for whatever side effects the helper has.
    # NOTE(review): presumably it reports/aborts on read errors - confirm.
    apply_necessary_remediation(None, load_remediation)

    # Remediations gathered from reading the file and from each validator.
    collected = [] if load_remediation is None else [load_remediation]
    for validate in get_validators():
        outcome = validate(df)
        if outcome is None:
            continue
        collected.append(outcome)
        df = apply_necessary_remediation(df, outcome)

    # True when at least one collected remediation carries a message.
    has_actions = any(
        item
        for item in collected
        if not (item.optional_msg is None and item.necessary_msg is None)
    )

    if not has_actions:
        sys.stdout.write("\n\nNo remediations found.\n")
    else:
        sys.stdout.write(
            "\n\nBased on the analysis we will perform the following actions:\n"
        )
        for item in collected:
            df = apply_optional_remediation(df, item)

    write_out_file(df, path, has_actions)
442+
443+
444+
def tools_register(parser):
    """Register the client-side "tools" subcommands on ``parser``.

    Adds a ``fine_tunes.prepare_data`` subcommand that requires a
    ``-f/--file`` argument and dispatches to ``FineTune.prepare_data``.
    When no subcommand is given, the default handler prints the parser's
    help text.

    Args:
        parser: The argparse parser (or sub-parser) to attach the tools to.
    """
    subparsers = parser.add_subparsers(
        title="Tools", help="Convenience client side tools"
    )

    def help(args):
        parser.print_help()

    # Fallback handler used when no tool subcommand is selected.
    parser.set_defaults(func=help)

    sub = subparsers.add_parser("fine_tunes.prepare_data")
    sub.add_argument(
        "-f",
        "--file",
        required=True,
        # Adjacent literals are concatenated verbatim, so the separating
        # space after the first sentence has to be written explicitly.
        help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed. "
        "This should be the local file path.",
    )
    sub.set_defaults(func=FineTune.prepare_data)
463+
396464

397-
def register(parser):
465+
def api_register(parser):
398466
# Engine management
399467
subparsers = parser.add_subparsers(help="All API subcommands")
400468

0 commit comments

Comments
 (0)