1
+ # https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
2
+ # https://www.udemy.com/data-science-natural-language-processing-in-python
3
+
4
+ # Author: http://lazyprogrammer.me
5
+ from __future__ import print_function , division
6
+ from future .utils import iteritems
7
+ from builtins import range
8
+ # Note: you may need to update your version of future
9
+ # sudo pip install -U future
10
+
11
+
12
+ from gensim .models import KeyedVectors
13
+
14
+
15
+ # warning: loading takes quite a while
16
+ # https://code.google.com/archive/p/word2vec/
17
+ # direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
18
+ # 3 million words and phrases
19
+ # D = 300
20
# Load the pretrained word2vec vectors from the binary-format file on disk.
# Every helper below queries this single module-level object.
word_vectors = KeyedVectors.load_word2vec_format(
    '../large_files/GoogleNews-vectors-negative300.bin', binary=True)
24
+
25
+
26
+ # convenience
27
+ # result looks like:
28
+ # [('athens', 0.6001024842262268),
29
+ # ('albert', 0.5729557275772095),
30
+ # ('holmes', 0.569324254989624),
31
+ # ('donnie', 0.5690680742263794),
32
+ # ('italy', 0.5673537254333496),
33
+ # ('toni', 0.5666348338127136),
34
+ # ('spain', 0.5661854147911072),
35
+ # ('jh', 0.5661597847938538),
36
+ # ('pablo', 0.5631559491157532),
37
+ # ('malta', 0.5620371103286743)]
38
def find_analogies(w1, w2, w3):
    """Print the best completion of the analogy ``w1 - w2 = ? - w3``.

    Queries ``word_vectors.most_similar`` with ``w1`` and ``w3`` as positive
    terms and ``w2`` as the negative term, then prints the first returned
    word in the form ``w1 - w2 = <word> - w3``.

    Args:
        w1: first positive word.
        w2: negative word.
        w3: second positive word.

    Returns:
        None; the result is written to stdout.
    """
    candidates = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
    # Each candidate is indexed as (word, score); only the top word is shown.
    top_candidate = candidates[0]
    print("%s - %s = %s - %s" % (w1, w2, top_candidate[0], w3))
41
+
42
def nearest_neighbors(w):
    """Print the words that ``word_vectors.most_similar`` returns for ``w``.

    Args:
        w: the query word.

    Returns:
        None; a header line and one line per neighbor are written to stdout.
    """
    similar = word_vectors.most_similar(positive=[w])
    print("neighbors of: %s" % w)
    # Each entry unpacks to (word, score); the score is not displayed.
    for neighbor, _score in similar:
        print("\t %s" % neighbor)
47
+
48
+
49
# Demo: analogy queries, run in the listed order.
# Each triple is passed to find_analogies as (w1, w2, w3).
# The last two triples repeat earlier ones, so those results print twice.
_ANALOGY_QUERIES = (
    ('king', 'man', 'woman'),
    ('france', 'paris', 'london'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
    ('france', 'french', 'english'),
    ('japan', 'japanese', 'chinese'),
    ('japan', 'japanese', 'italian'),
    ('japan', 'japanese', 'australian'),
    ('december', 'november', 'june'),
    ('miami', 'florida', 'texas'),
    ('einstein', 'scientist', 'painter'),
    ('china', 'rice', 'bread'),
    ('man', 'woman', 'she'),
    ('man', 'woman', 'aunt'),
    ('man', 'woman', 'sister'),
    ('man', 'woman', 'wife'),
    ('man', 'woman', 'actress'),
    ('man', 'woman', 'mother'),
    ('heir', 'heiress', 'princess'),
    ('nephew', 'niece', 'aunt'),
    ('france', 'paris', 'tokyo'),
    ('france', 'paris', 'beijing'),
    ('february', 'january', 'november'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
)

for _first, _second, _third in _ANALOGY_QUERIES:
    find_analogies(_first, _second, _third)
74
+
75
# Demo: print the neighbor list for each query word, in the listed order.
_NEIGHBOR_QUERIES = (
    'king',
    'france',
    'japan',
    'einstein',
    'woman',
    'nephew',
    'february',
    'rome',
)

for _query in _NEIGHBOR_QUERIES:
    nearest_neighbors(_query)
0 commit comments