Skip to content

Commit abdb2fc

Browse files
committed
dataset utils
1 parent 49b8e2e commit abdb2fc

File tree

4 files changed

+1146
-0
lines changed

4 files changed

+1146
-0
lines changed

ch07/02_dataset-utilities/README.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Chapter 7: Instruction and Preference Finetuning
2+
3+
This folder contains utility code that can be used for preparing an instruction dataset.
4+
5+
6+
7+
### Finding near duplicates
8+
9+
The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example:
10+
11+
12+
13+
```bash
14+
python find-near-duplicates.py --json_file instruction-examples.json
15+
```
16+
17+
```
18+
19+
20+
==================================================
21+
Searching 'instruction' for duplicates ...
22+
==================================================
23+
Duplicate pair found with similarity 0.85:
24+
1. Determine the state of matter for helium at room temperature.
25+
2. Determine the state of matter for nitrogen at room temperature.
26+
27+
Duplicate pair found with similarity 0.98:
28+
1. Edit the following sentence to make it more formal.
29+
2. Edit the sentence to make it more formal.
30+
31+
Duplicate pair found with similarity 1.00:
32+
1. Name a dwarf planet in our solar system.
33+
2. Name a dwarf planet in our solar system.
34+
35+
Duplicate pair found with similarity 0.88:
36+
1. Change the sentences from active voice to passive voice.
37+
2. Change the sentence from passive to active voice.
38+
39+
40+
41+
==================================================
42+
Searching 'input' for duplicates ...
43+
==================================================
44+
Duplicate pair found with similarity 0.88:
45+
1.
46+
2. She said, "I am tired."
47+
48+
49+
50+
==================================================
51+
Searching 'output' for duplicates ...
52+
==================================================
53+
Duplicate pair found with similarity 0.82:
54+
1. Helium is in a gaseous state at room temperature.
55+
2. Nitrogen is in a gaseous state at room temperature.
56+
57+
Duplicate pair found with similarity 1.00:
58+
1. One dwarf planet in our solar system is Pluto.
59+
2. One dwarf planet in our solar system is Pluto.
60+
61+
62+
```
63+
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
3+
# Source for "Build a Large Language Model From Scratch"
4+
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
5+
# Code: https://github.com/rasbt/LLMs-from-scratch
6+
7+
import argparse
8+
import json
9+
from sklearn.feature_extraction.text import TfidfVectorizer
10+
from sklearn.metrics.pairwise import cosine_similarity
11+
12+
13+
# Small built-in sample dataset in the instruction/input/output format
example_data = [
    {
        "instruction": "What is the capital of Italy?",
        "input": "",
        "output": "The capital of Italy is Rome.",
    },
    {
        "instruction": "What's the capital city of Italy?",
        "input": "",
        "output": "The capital city is Rome.",
    },
    {
        "instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
        "input": "",
        "output": "The verb is 'sleeps'.",
    },
    {
        "instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
        "input": "",
        "output": "The verb in the sentence is \"sleeps.\"",
    },
    # Add other entries...
]
21+
22+
23+
def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
    """Find pairs of entries whose `key` text is nearly identical.

    Texts are compared via the cosine similarity of their TF-IDF vectors.
    The higher the threshold, the more similar the texts have to be to match.

    Args:
        json_data: Sequence of dict entries (e.g., an instruction dataset).
        threshold: Pairs whose cosine similarity is strictly greater than
            this value are reported.
        key: Name of the field whose text is compared. Entries where this
            field is missing, empty, or not a string are skipped.

    Returns:
        A list of `(entry_a, entry_b, similarity)` tuples, where both entries
        are the original items from `json_data`.
    """
    # Keep the entries themselves (not only their text) so that row i of the
    # similarity matrix always refers to candidates[i]. Indexing json_data
    # with matrix indices reports the wrong entries as soon as one entry
    # has been skipped by the filter.
    candidates = [
        item for item in json_data
        if isinstance(item.get(key), str) and item[key]
    ]
    near_duplicates = []

    # A pair needs at least two texts; nothing to vectorize otherwise
    if len(candidates) < 2:
        return near_duplicates

    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform([item[key] for item in candidates])

    # Compute cosine similarity between each pair of entries
    cos_sim_matrix = cosine_similarity(tfidf_matrix)

    # Collect each unordered pair (i < j) once if it exceeds the threshold
    num_candidates = len(candidates)
    for i in range(num_candidates):
        for j in range(i + 1, num_candidates):
            if cos_sim_matrix[i, j] > threshold:
                near_duplicates.append((candidates[i], candidates[j], cos_sim_matrix[i, j]))

    return near_duplicates
48+
49+
50+
def find_and_print_new_duplicates(json_data):
    """Search every field of the dataset for near-duplicates and print them.

    The field names are taken from the first entry of `json_data`; each field
    is searched separately with `find_near_duplicates`, and every matching
    pair is printed together with its similarity score.

    Args:
        json_data: Sequence of dict entries.
    """
    # An empty dataset has no first entry to read the field names from
    if not json_data:
        print("No duplicates found")
        return

    for key in json_data[0]:
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")
        if not near_duplicates:
            print("No duplicates found")
            continue
        # Each result is (entry_a, entry_b, similarity)
        for dup in near_duplicates:
            print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
                  f"1. {dup[0][key]}\n2. {dup[1][key]}\n")
60+
61+
62+
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json_file",
        type=str,
        help=("Path to the dataset JSON file")
    )
    args = parser.parse_args()

    if not args.json_file:
        # No file given: fall back to the built-in sample dataset
        json_data = example_data
    else:
        # Read the JSON file as UTF-8 explicitly instead of relying on the
        # platform's default encoding, which differs between systems
        with open(args.json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)

    find_and_print_new_duplicates(json_data)

0 commit comments

Comments
 (0)