
Commit c47a197

Author: Kevin (committed)
Commit message: add EMNLP 2016 system
1 parent b89471c, commit c47a197

17 files changed: +535 lines, -299 lines

README.md

Lines changed: 10 additions & 5 deletions
@@ -1,7 +1,7 @@
 # Coreference Resolution with Deep Learning
 
 This repository contains code for training and running the neural coreference models described in two papers:
-* [Coming Soon] ["Deep Reinforcement Learning for Mention-Ranking Coreference Models"](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf), Kevin Clark and Christopher D. Manning, EMNLP 2016.
+* ["Deep Reinforcement Learning for Mention-Ranking Coreference Models"](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf), Kevin Clark and Christopher D. Manning, EMNLP 2016.
 * ["Improving Coreference Resolution by Learning Entity-Level Distributed Representations"](http://cs.stanford.edu/people/kevclark/resources/clark-manning-acl16-improving.pdf), Kevin Clark and Christopher D. Manning, ACL 2016.
 
 ### Requirements
@@ -13,9 +13,14 @@ The easiest way of doing this is within Stanford's [CoreNLP](https://github.com/
 ```
 java -Xmx5g -cp stanford-corenlp.jar edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,mention,coref -coref.algorithm neural -file example_file.txt
 ```
-You will need to fork the latest version from github and download the latest models from [here](http://nlp.stanford.edu/software/stanford-english-corenlp-models-current.jar).
+See the [CorefAnnotator](http://stanfordnlp.github.io/CoreNLP/coref.html) page for more details.
+
 
 #### Training your own model
-1. Download pretrained word embeddings. We use 50 dimensional word2vec embeddings for English ([link](https://drive.google.com/open?id=0B5Y5rz_RUKRmdEFPcGIwZ2xLRW8)) and 64 dimenensional [polyglot](https://sites.google.com/site/rmyeid/projects/polyglot) embeddings for Chinese ([link](http://bit.ly/19bTKeS)) in our paper.
-2. Run the [NeuralCorefDataExporter](https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java) class in the development version Stanford's CoreNLP using [this](https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/coref/neural/properties/english-conll.properties) properties file. This does mention detection and feature extraction on the CoNLL data and then outputs the results as json.
-3. Run run_all.py, preferably on a GPU. Training takes roughly 4 days on a GTX TITAN GPU.
+The following steps train the neural mention-ranking model with reward rescaling (the highest-scoring model from the papers).
+1. Download the CoNLL training data from [here](http://conll.cemantix.org/2012/data.html).
+2. Download pretrained word embeddings. We use 50-dimensional word2vec embeddings for English ([link](https://drive.google.com/open?id=0B5Y5rz_RUKRmdEFPcGIwZ2xLRW8)) and 64-dimensional [polyglot](https://sites.google.com/site/rmyeid/projects/polyglot) embeddings for Chinese ([link](http://bit.ly/19bTKeS)) in our paper.
+3. Run the [NeuralCorefDataExporter](https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java) class in the latest version of Stanford's CoreNLP using the [neural-coref-conll](https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/coref/properties/neural-english-conll.properties) properties file. This does mention detection and feature extraction on the CoNLL data and then outputs the results as json.
+4. Run run_all.py, preferably on a GPU. Training takes roughly 7 days on a GTX TITAN X GPU.
+
+run_all.py also contains methods to train the other models from the papers.
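
The "reward rescaling" mentioned in the new training instructions refers to the slack-rescaled max-margin objective of the EMNLP 2016 paper, where each wrong antecedent decision is penalized in proportion to how much it hurts the final coreference reward. The snippet below is a minimal illustrative sketch of that idea, not the code added in this commit; the function and argument names are hypothetical.

```python
import numpy as np

def reward_rescaled_loss(scores, true_antecedents, costs):
    """Slack-rescaled max-margin loss for a single anaphor (illustrative only).

    scores           -- model score s(a, m) for each candidate antecedent a
    true_antecedents -- indices of the correct antecedents for this anaphor
    costs            -- per-candidate mistake costs, e.g. the drop in B-cubed
                        reward incurred by linking to that candidate
    """
    best_true = max(scores[t] for t in true_antecedents)
    # Each wrong candidate should score below the best true antecedent by a
    # margin; violations are scaled by that candidate's mistake cost.
    losses = [costs[a] * max(0.0, 1.0 + scores[a] - best_true)
              for a in range(len(scores)) if a not in true_antecedents]
    return max(losses) if losses else 0.0

# Example: candidate 0 is the true antecedent; candidate 2 is a costly mistake.
print(reward_rescaled_loss(np.array([1.2, 0.9, 2.0]), {0}, np.array([0.0, 0.5, 1.0])))
```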

build_dataset.py renamed to build_datasets.py

Lines changed: 7 additions & 7 deletions
@@ -1,6 +1,6 @@
 import directories
-import util
-from dataset import PairDataBuilder, MentionDataBuilder, DocumentDataBuilder
+import utils
+from datasets import PairDataBuilder, MentionDataBuilder, DocumentDataBuilder
 from word_vectors import WordVectors
 import random
 import numpy as np
@@ -9,7 +9,7 @@
 def explore_pairwise_features():
     pos_sum, neg_sum = np.zeros(9), np.zeros(9)
     pos_count, neg_count = 0, 0
-    for i, d in enumerate(util.load_json_lines(directories.RAW + "train")):
+    for i, d in enumerate(utils.load_json_lines(directories.RAW + "train")):
         for key in d["labels"].keys():
             if d["labels"][key] == 1:
                 pos_sum += d["pair_features"][key]
@@ -25,7 +25,7 @@ def explore_pairwise_features():
 
 
 def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
-    doc_vectors = util.load_pickle(directories.MISC + name.replace("_reduced", "") +
+    doc_vectors = utils.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                    "_document_vectors.pkl")
 
     main_pairs = PairDataBuilder(columns)
@@ -35,9 +35,9 @@ def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None)
     main_docs = DocumentDataBuilder(columns)
     tune_docs = DocumentDataBuilder(columns)
 
-    print "Building dataset", name
-    p = util.Progbar(target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
-    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
+    print "Building dataset", name + ("/tune" if tune_fraction > 0 else "")
+    p = utils.Progbar(target=(2 if reduced else utils.lines_in_file(directories.RAW + name)))
+    for i, d in enumerate(utils.load_json_lines(directories.RAW + name)):
         if reduced and i > 2:
             break
         p.update(i + 1)
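
build_datasets.py consumes the exporter's output one document per line through utils.load_json_lines, which is not part of this diff. The stand-in below shows the assumed behavior of that helper (one JSON object per line); it is a sketch, not the repository's actual implementation.

```python
import json

def load_json_lines(path):
    """Yield one parsed JSON object (one document) per line of the file.

    Assumed behavior of the repo's utils.load_json_lines helper; the real
    implementation may differ.
    """
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Mirroring explore_pairwise_features above: each exported document is expected
# to carry "labels" and "pair_features" keyed by mention pair, for example:
# for doc in load_json_lines("data/raw/train"):
#     positive_pairs = [k for k, y in doc["labels"].items() if y == 1]
```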

clustering_learning.py

Lines changed: 22 additions & 21 deletions
@@ -1,9 +1,9 @@
-import dataset
+import datasets
 import clustering_models
 import model_properties
 import directories
 import timer
-import util
+import utils
 import evaluation
 from document import Document
 from clustering_preprocessing import ActionSpace
@@ -118,7 +118,7 @@ def train_all(self):
         timer.start("train")
 
         model_weights = self.model.get_weights()
-        prog = util.Progbar(len(self.memory))
+        prog = utils.Progbar(len(self.memory))
         random.shuffle(self.memory)
         for i, X in enumerate(self.memory):
             loss = self.train_on_example(X)
@@ -176,7 +176,7 @@ def __init__(self, trainer, docs, data, message, replay_memory=None, beta=0,
         random.shuffle(docs)
         if self.training:
             docs = docs[:docs_per_iteration]
-        prog = util.Progbar(len(docs))
+        prog = utils.Progbar(len(docs))
         for i, (doc, actionstate) in enumerate(docs):
             self.trainer.doc = doc
             self.trainer.actionstate = actionstate
@@ -243,27 +243,28 @@ def evaluate(trainer, docs, data, message):
 
 
 def load_docs(dataset_name, word_vectors):
-    return (dataset.Dataset(dataset_name, model_properties.MentionRankingProps(), word_vectors),
-            zip(util.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl'),
-                util.load_pickle(directories.ACTION_SPACE + dataset_name + '_action_space.pkl')))
+    return (datasets.Dataset(dataset_name, model_properties.MentionRankingProps(), word_vectors),
+            zip(utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl'),
+                utils.load_pickle(directories.ACTION_SPACE + dataset_name + '_action_space.pkl')))
 
 
 class Trainer:
     def __init__(self, model_props, train_set='train', test_set='dev', n_epochs=200,
                  empty_buffer=True, betas=None, write_every=1, max_docs=10000):
+        self.model_props = model_props
         if betas is None:
-            betas = [0]
+            betas = [0.8 ** i for i in range(1, 5)]
        self.write_every = write_every
 
-        print "Model=" + directories.CLUSTERER + ", ordering from " + directories.ACTION_SPACE
+        print "Model=" + model_props.path + ", ordering from " + directories.ACTION_SPACE
         self.pair_model, self.anaphoricity_model, self.model, word_vectors = \
             clustering_models.get_models(model_props)
         json_string = self.model.to_json()
-        open(directories.CLUSTERER + 'architecture.json', 'w').write(json_string)
-        util.rmkdir(directories.CLUSTERER + 'src')
+        open(model_props.path + 'architecture.json', 'w').write(json_string)
+        utils.rmkdir(model_props.path + 'src')
         for fname in os.listdir('.'):
             if fname.endswith('.py'):
-                shutil.copyfile(fname, directories.CLUSTERER + 'src/' + fname)
+                shutil.copyfile(fname, model_props.path + 'src/' + fname)
 
         self.train_data, self.train_docs = load_docs(train_set, word_vectors)
         print "Train loaded"
@@ -290,7 +291,7 @@ def __init__(self, model_props, train_set='train', test_set='dev', n_epochs=200,
         replay_memory = ReplayMemory(self, self.model)
         for self.epoch in range(n_epochs):
             print 80 * "-"
-            print "ITERATION", (self.epoch + 1), "model =", directories.CLUSTERER
+            print "ITERATION", (self.epoch + 1), "model =", model_props.path
             ar = AgentRunner(self, self.train_docs, self.train_data, "Training", replay_memory,
                              beta=0 if self.epoch >= len(betas) else betas[self.epoch])
             self.train_pairs = ar.merged_pairs
@@ -312,7 +313,7 @@ def run_evaluation(self):
         epoch_stats.update({"train " + k: v for k, v in train_scores.iteritems()})
         epoch_stats.update({"test " + k: v for k, v in test_scores.iteritems()})
         self.history.append(epoch_stats)
-        util.write_pickle(self.history, directories.CLUSTERER + 'history.pkl')
+        utils.write_pickle(self.history, self.model_props.path + 'history.pkl')
         timer.print_totals()
 
         test_conll = epoch_stats["test conll"]
@@ -327,17 +328,17 @@ def run_evaluation(self):
             print "New best CoNLL in window, saving model"
             self.save_progress(dev_pairs, test_pairs,
                                str(self.write_every * int(self.epoch / self.write_every)))
-            self.model.save_weights(directories.CLUSTERER + "weights.hdf5", overwrite=True)
+            self.model.save_weights(self.model_props.path + "weights.hdf5", overwrite=True)
 
     def save_progress(self, dev_pairs, test_pairs, prefix):
-        self.model.save_weights(directories.CLUSTERER + prefix + "_weights.hdf5", overwrite=True)
-        write_pairs(dev_pairs, prefix + "_dev_pairs")
-        write_pairs(test_pairs, prefix + "_test_pairs")
-        write_pairs(self.train_pairs, prefix + "_train_pairs")
+        self.model.save_weights(self.model_props.path + prefix + "_weights.hdf5", overwrite=True)
+        write_pairs(dev_pairs, self.model_props.path + prefix + "_dev_pairs")
+        write_pairs(test_pairs, self.model_props.path + prefix + "_test_pairs")
+        write_pairs(self.train_pairs, self.model_props.path + prefix + "_train_pairs")
 
 
-def write_pairs(pairs, name):
-    with open(directories.CLUSTERER + name, 'w') as f:
+def write_pairs(pairs, path):
+    with open(path, 'w') as f:
         for did, doc_merged_pairs in pairs.iteritems():
             f.write(str(did) + "\t")
             for m1, m2 in doc_merged_pairs:

clustering_preprocessing.py

Lines changed: 8 additions & 8 deletions
@@ -1,4 +1,4 @@
-import util
+import utils
 import directories
 import shutil
 import timer
@@ -91,7 +91,7 @@ def write_probable_pairs(dataset_name, action_space_path, scores):
     margin_removals = 0
     total_pairs = 0
     total_size = 0
-    for did in util.logged_loop(scores):
+    for did in utils.logged_loop(scores):
         doc_scores = scores[did]
         pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                        key=lambda pr: doc_scores[pr] - (-1 - 0.3*doc_scores[(-1, pr[1])]),
@@ -121,17 +121,17 @@ def write_probable_pairs(dataset_name, action_space_path, scores):
     print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
     print "avg size: {:.1f}".format(total_size / float(len(scores)))
     print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
-    util.write_pickle(probable_pairs, action_space_path + dataset_name + '_probable_pairs.pkl')
+    utils.write_pickle(probable_pairs, action_space_path + dataset_name + '_probable_pairs.pkl')
     shutil.copyfile('clustering_preprocessing.py',
                     action_space_path + 'clustering_preprocessing.py')
 
 
 def write_action_spaces(dataset_name, action_space_path, model_path, ltr=False):
     output_file = action_space_path + dataset_name + "_action_space.pkl"
     print "Writing candidate actions to " + output_file
-    scores = util.load_pickle(model_path + dataset_name + "_scores.pkl")
+    scores = utils.load_pickle(model_path + dataset_name + "_scores.pkl")
     write_probable_pairs(dataset_name, action_space_path, scores)
-    probable_pairs = util.load_pickle(action_space_path + dataset_name + '_probable_pairs.pkl')
+    probable_pairs = utils.load_pickle(action_space_path + dataset_name + '_probable_pairs.pkl')
 
     possible_pairs_total = 0
     action_spaces = []
@@ -152,12 +152,12 @@ def write_action_spaces(dataset_name, action_space_path, model_path, ltr=False):
         possible_pairs = get_possible_pairs(probable_pairs[did])
         possible_pairs_total += len(possible_pairs)
         action_spaces.append(ActionSpace(did, actions, possible_pairs))
-    util.write_pickle(action_spaces, output_file)
+    utils.write_pickle(action_spaces, output_file)
 
 
 def main(ranking_model):
     write_action_spaces("dev", directories.ACTION_SPACE,
-                        directories.MODELS_BASE + ranking_model + "/")
+                        directories.MODELS + ranking_model + "/")
     write_action_spaces("test", directories.ACTION_SPACE,
-                        directories.MODELS_BASE + ranking_model + "/")
+                        directories.MODELS + ranking_model + "/")

dataset.py renamed to datasets.py

Lines changed: 20 additions & 8 deletions
@@ -1,5 +1,5 @@
 import timer
-import util
+import utils
 import directories
 import numpy as np
 
@@ -37,7 +37,7 @@ def write(self, path):
         self.data = np.array(self.data, dtype='bool') \
             if self.name == 'y' or self.name == 'pf' else np.vstack(self.data)
         print "Writing {:}, dtype={:}, size={:}".format(self.name, str(self.data.dtype),
-                                                        util.sizeof_fmt(self.data.nbytes))
+                                                        utils.sizeof_fmt(self.data.nbytes))
         np.save(path + self.name, self.data)
 
 
@@ -47,7 +47,7 @@ def __init__(self, columns=None):
         self.mention_inds = DatasetColumn('dmi', columns)
         self.pair_inds = DatasetColumn('dpi', columns)
         self.features = DatasetColumn('df', columns)
-        self.genres = util.load_pickle(directories.MISC + 'genres.pkl')
+        self.genres = utils.load_pickle(directories.MISC + 'genres.pkl')
 
     def add_doc(self, ms, me, ps, pe, features):
         self.mention_inds.append(np.array([ms, me], dtype='int32'))
@@ -58,7 +58,7 @@ def add_doc(self, ms, me, ps, pe, features):
     def write(self, dataset_name):
         path = directories.DOC_DATA + dataset_name + '/'
         if not self.columns:
-            util.rmkdir(path)
+            utils.rmkdir(path)
         self.mention_inds.write(path)
         self.pair_inds.write(path)
         self.features.write(path)
@@ -115,7 +115,7 @@ def span_vector(start, end):
     def write(self, dataset_name):
         path = directories.MENTION_DATA + dataset_name + '/'
         if not self.columns:
-            util.rmkdir(path)
+            utils.rmkdir(path)
         self.words.write(path)
         self.spans.write(path)
         self.features.write(path)
@@ -158,7 +158,7 @@ def add_pair(self, y, i1, i2, did, mid1, mid2, features):
     def write(self, dataset_name):
         path = directories.PAIR_DATA + dataset_name + '/'
         if not self.columns:
-            util.rmkdir(path)
+            utils.rmkdir(path)
         self.pair_indices.write(path)
         self.pair_features.write(path)
         self.y.write(path)
@@ -174,6 +174,7 @@ def size(self):
 class Dataset:
     def __init__(self, dataset_name, model_props, word_vectors):
         self.model_props = model_props
+        self.name = dataset_name
         mentions_path = directories.MENTION_DATA + dataset_name + '/'
         pair_path = directories.PAIR_DATA + dataset_name + '/'
         docs_path = directories.DOC_DATA + dataset_name + '/'
@@ -257,6 +258,16 @@ def featurize_pairs(self, m1, m2, batch, did):
 
 
 class DocumentBatchedDataset:
+    """
+    Shuffling and then iterating through all mention pairs in the dataset has two problems:
+    1. We want to compute a representation for a mention (in our case by looking up some
+       word embeddings and applying a hidden layer) once for every mention instead of
+       once for every pair of mentions it appears in.
+    2. For mention-ranking models, all pairs involving the current candidate anaphor must be
+       in the same batch.
+    We deal with this by instead using each document as a batch, except for large documents,
+    which we split into chunks.
+    """
     def __init__(self, dataset_name, model_props, max_pairs=10000, with_ids=False):
         self.name = dataset_name
         self.model_props = model_props
@@ -300,7 +311,6 @@ def __init__(self, dataset_name, model_props, max_pairs=10000, with_ids=False):
                                          np.ones(ana, dtype='int32')
                                          for ana in range(0, me - ms)])
         self.pair_nums += [np.array(p) for p in zip(pair_antecedents, pair_anaphors)]
-
         self.pair_nums = np.vstack(self.pair_nums)
 
         self.doc_sizes = {}
@@ -419,8 +429,8 @@ def __init__(self, dataset_name, model_props, max_pairs=10000, with_ids=False):
 
             min_anaphor = max_anaphor
             min_pair = max_pair
-
         timer.stop("preprocess_dataset")
+
         self.n_batches = len(self.batches)
         self.pairs_per_batch = float(self.n_pairs) / self.n_batches
         self.anaphoric_anaphors_per_batch = float(self.n_anaphoric_anaphors) / self.n_batches
@@ -478,6 +488,8 @@ def __iter__(self):
                 X['ends'] = ends[:, np.newaxis]
                 X['costs'] = costs[:, np.newaxis]
                 X['y'] = np.zeros((starts.size, 1))
+                if self.model_props.use_rewards:
+                    X['cost_ptrs'] = costs
             else:
                 X['y'] = self.y[pairs][:, np.newaxis]
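
The docstring added to DocumentBatchedDataset in the hunks above explains the batching strategy: treat each document as one batch so a mention's representation is computed once and all candidate antecedents of an anaphor stay together, splitting only very large documents into chunks. The sketch below illustrates that chunking idea in isolation; the names and the pair-count threshold are hypothetical simplifications, not the code from this commit.

```python
def document_batches(doc_mention_counts, max_pairs=10000):
    """Group anaphors into batches one document at a time (illustrative sketch).

    Anaphor k of a document contributes k candidate-antecedent pairs (all
    earlier mentions). A chunk is closed once it holds roughly max_pairs
    pairs, so batches never mix documents and never split an anaphor's pairs.
    """
    batches = []
    for doc_id, n_mentions in enumerate(doc_mention_counts):
        chunk, pairs_in_chunk = [], 0
        for anaphor in range(1, n_mentions):
            chunk.append((doc_id, anaphor))
            pairs_in_chunk += anaphor  # pairs (0, anaphor), ..., (anaphor - 1, anaphor)
            if pairs_in_chunk >= max_pairs:
                batches.append(chunk)
                chunk, pairs_in_chunk = [], 0
        if chunk:
            batches.append(chunk)
    return batches

# A 25-mention document stays whole; a 1000-mention document splits into chunks.
print([len(batch) for batch in document_batches([25, 1000])])
```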

directories.py

Lines changed: 14 additions & 31 deletions
@@ -1,10 +1,10 @@
-import util
+import utils
 
-DATA = './data/'
+DATA = '/scr/kevclark/clean_english_conll/'#'./data/'
 
 RAW = DATA + 'raw/'
-MODELS_BASE = DATA + 'models/'
-CLUSTERER_BASE = DATA + 'clusterers/'
+MODELS = DATA + 'models/'
+CLUSTERERS = DATA + 'clusterers/'
 DOCUMENTS = DATA + 'documents/'
 ACTION_SPACES_BASE = DATA + 'action_spaces/'
 GOLD = DATA + 'gold/'
@@ -19,36 +19,19 @@
 PAIR_DATA = FEATURES_BASE + 'mention_pair_data/'
 DOC_DATA = FEATURES_BASE + 'doc_data/'
 
-MODEL_NAME = 'model/'
-MODEL = MODELS_BASE + MODEL_NAME
-
-CLUSTERER_NAME = 'clusterer/'
-CLUSTERER = CLUSTERER_BASE + CLUSTERER_NAME
-
 ACTION_SPACE_NAME = 'action_spaces/'
 ACTION_SPACE = ACTION_SPACES_BASE + ACTION_SPACE_NAME
 
 assert DATA[-1] == '/'
 assert ACTION_SPACE_NAME[-1] == '/'
-assert MODEL_NAME[-1] == '/'
-assert CLUSTERER[-1] == '/'
-
-util.mkdir(MISC)
-util.mkdir(FEATURES_BASE)
-util.mkdir(MENTION_DATA)
-util.mkdir(PAIR_DATA)
-util.mkdir(DOC_DATA)
-util.mkdir(MODELS_BASE)
-util.mkdir(CLUSTERER_BASE)
-util.mkdir(MODEL)
-util.mkdir(CLUSTERER)
-util.mkdir(DOCUMENTS)
-util.mkdir(ACTION_SPACES_BASE)
-util.mkdir(ACTION_SPACE)
-
 
-def set_model_name(model_name):
-    global MODEL_NAME, MODEL
-    MODEL_NAME = model_name + '/'
-    MODEL = MODELS_BASE + MODEL_NAME
-    util.mkdir(MODEL)
+utils.mkdir(MISC)
+utils.mkdir(FEATURES_BASE)
+utils.mkdir(MENTION_DATA)
+utils.mkdir(PAIR_DATA)
+utils.mkdir(DOC_DATA)
+utils.mkdir(MODELS)
+utils.mkdir(CLUSTERERS)
+utils.mkdir(DOCUMENTS)
+utils.mkdir(ACTION_SPACES_BASE)
+utils.mkdir(ACTION_SPACE)
