Skip to content

Commit fd764ad

Browse files
authored
20200501 fbsync (#750)
1 parent 85be3b7 commit fd764ad

File tree

6 files changed

+9
-16
lines changed

6 files changed

+9
-16
lines changed

.python3

Whitespace-only changes.

test/test_vocab.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def test_vocab_download_fasttext_vectors(self):
132132
def test_vocab_extend(self):
133133
c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
134134
# Build a vocab and get vectors twice to test caching.
135-
for i in range(2):
135+
for _ in range(2):
136136
f = FastText(language='simple')
137137
v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
138138
vectors=f)
@@ -163,7 +163,7 @@ def test_vocab_extend(self):
163163
def test_vocab_download_custom_vectors(self):
164164
c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
165165
# Build a vocab and get vectors twice to test caching.
166-
for i in range(2):
166+
for _ in range(2):
167167
v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
168168
vectors=Vectors('wiki.simple.vec',
169169
url=FastText.url_base.format('simple')))

torchtext/data/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from functools import partial
77

88

9-
def _split_tokenizer(x):
9+
def _split_tokenizer(x): # noqa: F821
1010
# type: (str) -> List[str]
1111
return x.split()
1212

torchtext/experimental/datasets/raw/text_classification.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import torch
22
import io
33
from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader
4-
import sys
54

65
URLS = {
76
'AG_NEWS':
@@ -55,15 +54,9 @@ def __iter__(self):
5554
self.setup_iter()
5655

5756
for i, item in enumerate(self._iterator):
58-
if i == self.start:
59-
break
60-
61-
num_lines = self.num_lines if self.num_lines is not None else sys.maxsize
62-
for _ in range(num_lines):
63-
yield item
64-
try:
65-
item = next(self._iterator)
66-
except StopIteration:
57+
if i >= self.start:
58+
yield item
59+
if self.num_lines is not None and i == (self.start + self.num_lines - 1):
6760
break
6861

6962
def get_iterator(self):

torchtext/experimental/datasets/text_classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def _forward(token_list):
2727

2828
def build_vocab(data, transforms):
2929
tok_list = []
30-
for (label, txt) in data:
30+
for _, txt in data:
3131
tok_list.append(transforms(txt))
3232
return build_vocab_from_iterator(tok_list)
3333

torchtext/vocab.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class Vocab(object):
3131
# TODO (@mttk): Populate class with default values of special symbols
3232
UNK = '<unk>'
3333

34-
def __init__(self, counter, max_size=None, min_freq=1, specials=['<unk>', '<pad>'],
34+
def __init__(self, counter, max_size=None, min_freq=1, specials=('<unk>', '<pad>'),
3535
vectors=None, unk_init=None, vectors_cache=None, specials_first=True):
3636
"""Create a Vocab object from a collections.Counter.
3737
@@ -218,7 +218,7 @@ def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_):
218218

219219
class SubwordVocab(Vocab):
220220

221-
def __init__(self, counter, max_size=None, specials=['<pad>'],
221+
def __init__(self, counter, max_size=None, specials=('<pad>',),
222222
vectors=None, unk_init=torch.Tensor.zero_):
223223
"""Create a revtok subword vocabulary from a collections.Counter.
224224

0 commit comments

Comments
 (0)