
Commit 4a52b07

update
1 parent 8e4e8f0 commit 4a52b07

File tree

2 files changed: +228 -0 lines changed


nlp_class2/bow_classifier.py

Lines changed: 146 additions & 0 deletions
# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors


# data from https://www.cs.umb.edu/~smimarog/textmining/datasets/
train = pd.read_csv('../large_files/r8-train-all-terms.txt', header=None, sep='\t')
test = pd.read_csv('../large_files/r8-test-all-terms.txt', header=None, sep='\t')
train.columns = ['label', 'content']
test.columns = ['label', 'content']


class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('../large_files/glove.6B/glove.6B.50d.txt') as f:
      # it's just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    # represent each document by the mean of its word vectors
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


class Word2VecVectorizer:
  def __init__(self):
    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      '../large_files/GoogleNews-vectors-negative300.bin',
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of the vectors
    v = self.word_vectors.get_vector('king')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


vectorizer = GloveVectorizer()
# vectorizer = Word2VecVectorizer()
Xtrain = vectorizer.fit_transform(train.content)
Ytrain = train.label

Xtest = vectorizer.transform(test.content)
Ytest = test.label


# create the model, train it, print scores
model = RandomForestClassifier(n_estimators=200)
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))
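A side note, not part of the committed file: the script imports ExtraTreesClassifier and defines Word2VecVectorizer but exercises neither. A minimal sketch of swapping both in, assuming the GoogleNews binary is already at the path the script uses:

# hypothetical variation, not in the commit: same pipeline with the
# unused imports swapped in
vectorizer = Word2VecVectorizer()               # instead of GloveVectorizer()
Xtrain = vectorizer.fit_transform(train.content)
Xtest = vectorizer.transform(test.content)

model = ExtraTreesClassifier(n_estimators=200)  # instead of RandomForestClassifier
model.fit(Xtrain, train.label)
print("train score:", model.score(Xtrain, train.label))
print("test score:", model.score(Xtest, test.label))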

nlp_class2/pretrained_w2v.py

Lines changed: 82 additions & 0 deletions
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python

# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


from gensim.models import KeyedVectors


# warning: takes quite a while to load
# https://code.google.com/archive/p/word2vec/
# direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# 3 million words and phrases
# D = 300
word_vectors = KeyedVectors.load_word2vec_format(
  '../large_files/GoogleNews-vectors-negative300.bin',
  binary=True
)


# convenience functions; the result of most_similar looks like:
# [('athens', 0.6001024842262268),
#  ('albert', 0.5729557275772095),
#  ('holmes', 0.569324254989624),
#  ('donnie', 0.5690680742263794),
#  ('italy', 0.5673537254333496),
#  ('toni', 0.5666348338127136),
#  ('spain', 0.5661854147911072),
#  ('jh', 0.5661597847938538),
#  ('pablo', 0.5631559491157532),
#  ('malta', 0.5620371103286743)]
def find_analogies(w1, w2, w3):
  # solves w1 - w2 = ? - w3
  r = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
  print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))

def nearest_neighbors(w):
  r = word_vectors.most_similar(positive=[w])
  print("neighbors of: %s" % w)
  for word, score in r:
    print("\t%s" % word)


find_analogies('king', 'man', 'woman')
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')

nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')
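For intuition: find_analogies solves w1 - w2 = ? - w3, i.e. ? ≈ w1 - w2 + w3, and most_similar performs that search over the whole vocabulary. A minimal hand-rolled sketch of the same query (a hypothetical helper, not in the commit; gensim additionally unit-normalizes each vector before combining, so rankings can differ slightly):

# hypothetical helper, not in the commit: the analogy as explicit vector arithmetic
def find_analogies_manual(w1, w2, w3):
  # ? = w1 - w2 + w3, scored by cosine similarity over the vocabulary
  v = (word_vectors.get_vector(w1)
       - word_vectors.get_vector(w2)
       + word_vectors.get_vector(w3))
  for word, score in word_vectors.similar_by_vector(v, topn=4):
    if word not in (w1, w2, w3):  # skip the query words themselves
      print("%s - %s = %s - %s" % (w1, w2, word, w3))
      return

find_analogies_manual('king', 'man', 'woman')  # expect something like 'queen'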
