Commit c3a4b3c

nlp3
1 parent 232f10e commit c3a4b3c

10 files changed, +2020 -0 lines changed


nlp_class3/attention.py

Lines changed: 460 additions & 0 deletions
Large diffs are not rendered by default.
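The attention.py diff itself is not rendered on this page. For orientation only, here is a minimal sketch of the "simplified" feed-forward attention pooling idea listed in extra_reading.txt below, written in the same Keras style as the other files; this is an assumption for illustration and is not the contents of attention.py.

# NOT the contents of attention.py -- a small illustration of simplified
# (feed-forward) attention pooling over time
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Activation, RepeatVector, Permute, Multiply, Lambda
import keras.backend as K
import numpy as np

T, D = 8, 2  # arbitrary sequence length and feature size for the demo

input_ = Input(shape=(T, D))
scores = Dense(1, activation='tanh')(input_)   # N x T x 1, one score per time step
scores = Flatten()(scores)                     # N x T
alphas = Activation('softmax')(scores)         # N x T, attention weights over time
alphas = RepeatVector(D)(alphas)               # N x D x T
alphas = Permute((2, 1))(alphas)               # N x T x D
weighted = Multiply()([input_, alphas])        # weight each time step of the input
context = Lambda(lambda t: K.sum(t, axis=1))(weighted)  # N x D, weighted sum over time

model = Model(inputs=input_, outputs=context)
print(model.predict(np.random.randn(1, T, D)).shape)  # expect (1, 2)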

nlp_class3/bilstm_mnist.py

Lines changed: 100 additions & 0 deletions
# https://deeplearningcourses.com/c/deep-learning-advanced-nlp
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future


import os
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Bidirectional, GlobalMaxPooling1D, Lambda, Concatenate, Dense
import keras.backend as K
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def get_mnist(limit=None):
  if not os.path.exists('../large_files'):
    print("You must create a folder called large_files adjacent to the class folder first.")
  if not os.path.exists('../large_files/train.csv'):
    print("Looks like you haven't downloaded the data or it's not in the right spot.")
    print("Please get train.csv from https://www.kaggle.com/c/digit-recognizer")
    print("and place it in the large_files folder.")

  print("Reading in and transforming data...")
  df = pd.read_csv('../large_files/train.csv')
  data = df.values  # df.as_matrix() was removed in newer pandas; .values returns the same array
  np.random.shuffle(data)
  X = data[:, 1:].reshape(-1, 28, 28) / 255.0  # data is from 0..255
  Y = data[:, 0]
  if limit is not None:
    X, Y = X[:limit], Y[:limit]
  return X, Y



# get data
X, Y = get_mnist()

# config
D = 28
M = 15


# input is an image of size 28x28
input_ = Input(shape=(D, D))

# up-down
rnn1 = Bidirectional(LSTM(M, return_sequences=True))
x1 = rnn1(input_)  # output is N x D x 2M
x1 = GlobalMaxPooling1D()(x1)  # output is N x 2M

# left-right
rnn2 = Bidirectional(LSTM(M, return_sequences=True))

# custom layer to transpose the image so the second LSTM reads it column by column
permutor = Lambda(lambda t: K.permute_dimensions(t, pattern=(0, 2, 1)))

x2 = permutor(input_)
x2 = rnn2(x2)  # output is N x D x 2M
x2 = GlobalMaxPooling1D()(x2)  # output is N x 2M

# put them together
concatenator = Concatenate(axis=1)
x = concatenator([x1, x2])  # output is N x 4M

# final dense layer
output = Dense(10, activation='softmax')(x)

model = Model(inputs=input_, outputs=output)

# testing
# o = model.predict(X)
# print("o.shape:", o.shape)

# compile
model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)

# train
print('Training model...')
r = model.fit(X, Y, batch_size=32, epochs=10, validation_split=0.3)


# plot some data
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

# accuracies
plt.plot(r.history['acc'], label='acc')
plt.plot(r.history['val_acc'], label='val_acc')
plt.legend()
plt.show()
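Before committing to a full training run on the Kaggle CSV, a quick shape check with random dummy data confirms that the two bidirectional passes and the concatenation are wired up correctly. A minimal sketch, assuming the model built above; the batch of 4 fake "images" is for illustration only, not real MNIST data.

# shape check on random dummy images; uses the model defined above
dummy = np.random.randn(4, 28, 28)
p = model.predict(dummy)
print(p.shape)  # expect (4, 10): one softmax distribution over the 10 digits per image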

nlp_class3/bilstm_test.py

Lines changed: 33 additions & 0 deletions
# https://deeplearningcourses.com/c/deep-learning-advanced-nlp
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Bidirectional
import numpy as np
import matplotlib.pyplot as plt


T = 8
D = 2
M = 3


X = np.random.randn(1, T, D)


input_ = Input(shape=(T, D))
# rnn = Bidirectional(LSTM(M, return_state=True, return_sequences=True))
rnn = Bidirectional(LSTM(M, return_state=True, return_sequences=False))
x = rnn(input_)

model = Model(inputs=input_, outputs=x)
o, h1, c1, h2, c2 = model.predict(X)
print("o:", o)
print("o.shape:", o.shape)
print("h1:", h1)
print("c1:", c1)
print("h2:", h2)
print("c2:", c2)

nlp_class3/cnn_toxic.py

Lines changed: 158 additions & 0 deletions
# https://deeplearningcourses.com/c/deep-learning-advanced-nlp
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score


# Download the data:
# https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
# Download the word vectors:
# http://nlp.stanford.edu/data/glove.6B.zip


# some configuration
MAX_SEQUENCE_LENGTH = 100
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 10


# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('../large_files/glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
  # this is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))


# prepare text samples and their labels
print('Loading in comments...')

train = pd.read_csv("../large_files/toxic-comment/train.csv")
sentences = train["comment_text"].fillna("DUMMY_VALUE").values
possible_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[possible_labels].values

print("max sequence length:", max(len(s) for s in sentences))
print("min sequence length:", min(len(s) for s in sentences))
s = sorted(len(s) for s in sentences)
print("median sequence length:", s[len(s) // 2])


# convert the sentences (strings) into integers
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
# print("sequences:", sequences); exit()

# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

# pad sequences so that we get a N x T matrix
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)


# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
  if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i] = embedding_vector


# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)


print('Building model...')

# train a 1D convnet with global maxpooling
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
output = Dense(len(possible_labels), activation='sigmoid')(x)

model = Model(input_, output)
model.compile(
  loss='binary_crossentropy',
  optimizer='rmsprop',
  metrics=['accuracy']
)

print('Training model...')
r = model.fit(
  data,
  targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)


# plot some data
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

# accuracies
plt.plot(r.history['acc'], label='acc')
plt.plot(r.history['val_acc'], label='val_acc')
plt.legend()
plt.show()

# print the mean AUC over the six labels
p = model.predict(data)
aucs = []
for j in range(6):
  auc = roc_auc_score(targets[:,j], p[:,j])
  aucs.append(auc)
print(np.mean(aucs))
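To score an unseen comment after training, the same tokenizer and padding have to be applied before calling predict. A minimal sketch, assuming the objects defined above are still in scope; the example text is made up for illustration.

# scoring a new comment with the fitted tokenizer and trained model above
new_comments = ["you are a wonderful person"]  # hypothetical example text
seqs = tokenizer.texts_to_sequences(new_comments)
padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
probs = model.predict(padded)  # shape (1, 6): one probability per toxicity label
for label, prob in zip(possible_labels, probs[0]):
  print("%s: %.3f" % (label, prob))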

nlp_class3/extra_reading.txt

Lines changed: 44 additions & 0 deletions
https://deeplearningcourses.com/c/deep-learning-advanced-nlp

Bidirectional Recurrent Neural Networks
https://maxwell.ict.griffith.edu.au/spl/publications/papers/ieeesp97_schuster.pdf

Translation Modeling with Bidirectional Recurrent Neural Networks
http://emnlp2014.org/papers/pdf/EMNLP2014003.pdf

Sequence to Sequence Learning with Neural Networks
https://arxiv.org/abs/1409.3215

A Neural Conversational Model
https://arxiv.org/abs/1506.05869v3

Neural Machine Translation by Jointly Learning to Align and Translate (Attention)
https://arxiv.org/abs/1409.0473

Feed-Forward Networks with Attention Can Solve Some Long-Term Memory Problems (Simplified Attention)
https://arxiv.org/abs/1512.08756

Memory Networks
https://arxiv.org/abs/1410.3916

Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks
http://arxiv.org/abs/1502.05698

End-To-End Memory Networks
http://arxiv.org/abs/1503.08895

Ask Me Anything: Dynamic Memory Networks for Natural Language Processing
https://arxiv.org/abs/1506.07285

WaveNet
https://deepmind.com/blog/wavenet-generative-model-raw-audio/

Tacotron
https://google.github.io/tacotron/

Tacotron 2
https://research.googleblog.com/2017/12/tacotron-2-generating-human-like-speech.html

An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling
https://arxiv.org/abs/1803.01271
(just released March 2018!)
