diff --git a/example.py b/example.py index 41f292b..53e095f 100755 --- a/example.py +++ b/example.py @@ -1,11 +1,20 @@ import metapy def tokens_lowercase(doc): - #Write a token stream that tokenizes with ICUTokenizer (use the argument "suppress_tags=True"), + #Write a token stream that tokenizes with ICUTokenizer, #lowercases, removes words with less than 2 and more than 5 characters #performs stemming and creates trigrams (name the final call to ana.analyze as "trigrams") '''Place your code here''' + metapy.log_to_stderr() + tok = metapy.analyzers.ICUTokenizer(suppress_tags=True) + tok = metapy.analyzers.LowercaseFilter(tok) + tok = metapy.analyzers.LengthFilter(tok, min=2, max=5) + tok = metapy.analyzers.Porter2Filter(tok) + ana = metapy.analyzers.NGramWordAnalyzer(3, tok) + trigrams = ana.analyze(doc) + return trigrams + #leave the rest of the code as is tok.set_content(doc.content()) tokens, counts = [], []