
Commit 4a52b07

update
1 parent 8e4e8f0 commit 4a52b07

File tree

2 files changed: +228 -0 lines changed


nlp_class2/bow_classifier.py

Lines changed: 146 additions & 0 deletions
# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors


# data from https://www.cs.umb.edu/~smimarog/textmining/datasets/
train = pd.read_csv('../large_files/r8-train-all-terms.txt', header=None, sep='\t')
test = pd.read_csv('../large_files/r8-test-all-terms.txt', header=None, sep='\t')
train.columns = ['label', 'content']
test.columns = ['label', 'content']


class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('../large_files/glove.6B/glove.6B.50d.txt') as f:
      # it's just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    # represent each document by the mean of its word vectors
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


class Word2VecVectorizer:
  def __init__(self):
    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      '../large_files/GoogleNews-vectors-negative300.bin',
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of the vectors
    v = self.word_vectors.get_vector('king')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


vectorizer = GloveVectorizer()
# vectorizer = Word2VecVectorizer()
Xtrain = vectorizer.fit_transform(train.content)
Ytrain = train.label

Xtest = vectorizer.transform(test.content)
Ytest = test.label


# create the model, train it, print scores
model = RandomForestClassifier(n_estimators=200)
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))
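A side note, not part of the committed file: the script imports ExtraTreesClassifier and defines Word2VecVectorizer but exercises neither. A minimal sketch of swapping both in, assuming the GoogleNews binary is already at the path the script uses:

# hypothetical variation, not in the commit: same pipeline with the
# unused imports swapped in
vectorizer = Word2VecVectorizer()               # instead of GloveVectorizer()
Xtrain = vectorizer.fit_transform(train.content)
Xtest = vectorizer.transform(test.content)

model = ExtraTreesClassifier(n_estimators=200)  # instead of RandomForestClassifier
model.fit(Xtrain, train.label)
print("train score:", model.score(Xtrain, train.label))
print("test score:", model.score(Xtest, test.label))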

nlp_class2/pretrained_w2v.py

Lines changed: 82 additions & 0 deletions
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python

# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


from gensim.models import KeyedVectors


# warning: takes quite a while to load
# https://code.google.com/archive/p/word2vec/
# direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# 3 million words and phrases
# D = 300
word_vectors = KeyedVectors.load_word2vec_format(
  '../large_files/GoogleNews-vectors-negative300.bin',
  binary=True
)


# convenience functions; the result of most_similar looks like:
# [('athens', 0.6001024842262268),
#  ('albert', 0.5729557275772095),
#  ('holmes', 0.569324254989624),
#  ('donnie', 0.5690680742263794),
#  ('italy', 0.5673537254333496),
#  ('toni', 0.5666348338127136),
#  ('spain', 0.5661854147911072),
#  ('jh', 0.5661597847938538),
#  ('pablo', 0.5631559491157532),
#  ('malta', 0.5620371103286743)]
def find_analogies(w1, w2, w3):
  # solves w1 - w2 = ? - w3
  r = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
  print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))

def nearest_neighbors(w):
  r = word_vectors.most_similar(positive=[w])
  print("neighbors of: %s" % w)
  for word, score in r:
    print("\t%s" % word)


find_analogies('king', 'man', 'woman')
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')

nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')
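For intuition: find_analogies solves w1 - w2 = ? - w3, i.e. ? ≈ w1 - w2 + w3, and most_similar performs that search over the whole vocabulary. A minimal hand-rolled sketch of the same query (a hypothetical helper, not in the commit; gensim additionally unit-normalizes each vector before combining, so rankings can differ slightly):

# hypothetical helper, not in the commit: the analogy as explicit vector arithmetic
def find_analogies_manual(w1, w2, w3):
  # ? = w1 - w2 + w3, scored by cosine similarity over the vocabulary
  v = (word_vectors.get_vector(w1)
       - word_vectors.get_vector(w2)
       + word_vectors.get_vector(w3))
  for word, score in word_vectors.similar_by_vector(v, topn=4):
    if word not in (w1, w2, w3):  # skip the query words themselves
      print("%s - %s = %s - %s" % (w1, w2, word, w3))
      return

find_analogies_manual('king', 'man', 'woman')  # expect something like 'queen'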
