|
207 | 207 | "2. **Number of Messages Per Example**: Summarizes the distribution of the number of messages in each conversation, providing insight into dialogue complexity.\n",
|
208 | 208 | "3. **Total Tokens Per Example**: Calculates and summarizes the distribution of the total number of tokens in each conversation. Important for understanding fine-tuning costs.\n",
|
209 | 209 | "4. **Tokens in Assistant's Messages**: Calculates the number of tokens in the assistant's messages per conversation and summarizes this distribution. Useful for understanding the assistant's verbosity.\n",
|
210 |
| - "5. **Token Limit Warnings**: Checks if any examples exceed the maximum token limit (4096 tokens), as such examples will be truncated during fine-tuning, potentially resulting in data loss.\n" |
| 210 | + "5. **Token Limit Warnings**: Checks if any examples exceed the maximum token limit (16,385 tokens), as such examples will be truncated during fine-tuning, potentially resulting in data loss.\n" |
211 | 211 | ]
|
212 | 212 | },
|
213 | 213 | {
|
|
240 | 240 | "mean / median: 1610.2, 10.0\n",
|
241 | 241 | "p5 / p95: 6.0, 4811.200000000001\n",
|
242 | 242 | "\n",
|
243 |
| - "1 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n" |
| 243 | + "0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning\n" |
244 | 244 | ]
|
245 | 245 | }
|
246 | 246 | ],
|
|
267 | 267 | "print_distribution(n_messages, \"num_messages_per_example\")\n",
|
268 | 268 | "print_distribution(convo_lens, \"num_total_tokens_per_example\")\n",
|
269 | 269 | "print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n",
|
270 |
| - "n_too_long = sum(l > 4096 for l in convo_lens)\n", |
271 |
| - "print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")" |
| 270 | + "n_too_long = sum(l > 16385 for l in convo_lens)\n", |
| 271 | + "print(f\"\\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning\")" |
272 | 272 | ]
|
273 | 273 | },
|
274 | 274 | {
|
|
300 | 300 | ],
|
301 | 301 | "source": [
|
302 | 302 | "# Pricing and default n_epochs estimate\n",
|
303 |
| - "MAX_TOKENS_PER_EXAMPLE = 4096\n", |
| 303 | + "MAX_TOKENS_PER_EXAMPLE = 16385\n", |
304 | 304 | "\n",
|
305 | 305 | "TARGET_EPOCHS = 3\n",
|
306 | 306 | "MIN_TARGET_EXAMPLES = 100\n",
|
|
0 commit comments