
Commit 43b588d

committed
Added 2 Text Summarizer
1 parent 34eb768 commit 43b588d

File tree

12 files changed: +424 -0 lines changed

Python/Text_Summary/Lex_Rank/README.md

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Lex_Rank

Lex Rank approach for text summarization.

## Dependencies

- sumy
- spacy
- neologdn
  * _This requires a C++11 compiler_. Click [here](https://pypi.org/project/neologdn/) for documentation and [here](https://nuwen.net/mingw.html#install) for the C++11 compiler I use.

## Language models

- `en_core_web_sm`: A spaCy English multi-task CNN trained on OntoNotes.
- `punkt`: The NLTK Punkt sentence tokenizer.

## Setup

- Set up a `python 3.x` virtual environment.
- `Activate` the environment.
- Install the dependencies using ```pip3 install -r requirements.txt```
  * Install a C++ compiler if `neologdn` is triggering `wheel` errors.
- Set up the models by running the following commands,

```bash
$ python -m spacy download en_core_web_sm
$ python -c "import nltk; nltk.download('punkt')"
```

- Run the `main.py` file.
- Enter the source path.
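To verify the setup above, a short check like the one below (illustrative, not part of the repository) confirms that both models are installed:

```python
# Sanity check (illustrative, not part of the repository): confirm the models from Setup are installed.
import spacy
import nltk

spacy.load("en_core_web_sm")          # raises OSError if the spaCy model is missing
nltk.data.find("tokenizers/punkt")    # raises LookupError if the punkt tokenizer is missing
print("en_core_web_sm and punkt are ready.")
```
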
## Results

Results can be found [here](../assets).

Python/Text_Summary/Lex_Rank/main.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/usr/bin/env python
# coding: utf-8

import os

# Import from summary_make.py
from summary_make import summarize_sentences


def main():
    """
    Main function, wrapper around summary_make
    """
    filepath = input("Enter the Source File: ")
    with open(filepath, encoding='utf-8') as f:
        sentences = f.readlines()
        sentences = ' '.join(sentences)

    summary = summarize_sentences(sentences)

    # Write the summary next to the source file, e.g. article.txt -> article_lexRank.txt
    base, _ = os.path.splitext(filepath)
    outputpath = base + '_lexRank.txt'

    with open(outputpath, 'w') as w:
        for sentence in summary:
            w.write(str(sentence) + '\n')


if __name__ == "__main__":
    main()
Python/Text_Summary/Lex_Rank/preprocessing.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
#!/usr/bin/env python
# coding: utf-8


# Import
import spacy
import neologdn


class EnglishCorpus:
    """
    A class for retaining the structure of a text file as a corpus.

    ...

    Methods:
        preprocessing(text: str)
            Removes line breaks and normalizes special characters
        make_sentence_list(sentences: str)
            Breaks the text into a list of sentences using NLP
        make_corpus()
            Generates the corpus, preserving the morphological analysis
    """

    # Preparation of morphological analyzer
    def __init__(self):
        """
        Constructor to initialize the spaCy English model (see README)
        """
        self.nlp = spacy.load("en_core_web_sm")

    # Pre-processing of line breaks and special characters
    def preprocessing(self, text: str) -> str:
        """
        Removes line breaks and normalizes special characters.
        :param text: String of text to be processed
        :return: Text without line breaks and with normalized characters
        """
        text = text.replace("\n", "")
        text = neologdn.normalize(text)

        return text

    # Divide the text into sentences while retaining the results of morphological analysis
    def make_sentence_list(self, sentences: str) -> list:
        """
        Divides the text into a list of sentences while retaining the
        morphological analysis, using spaCy NLP.
        :param sentences: Text to be split into sentences
        :return: List of sentences
        """
        doc = self.nlp(sentences)
        self.ginza_sents_object = doc.sents
        sentence_list = [s for s in doc.sents]

        return sentence_list

    # Put a space between words
    def make_corpus(self) -> list:
        """
        Puts white space between words and generates the corpus.
        :return: Corpus for tokenizing
        """
        corpus = []
        for s in self.ginza_sents_object:
            tokens = [str(t) for t in s]
            corpus.append(" ".join(tokens))

        return corpus
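To show how the three methods above chain together, here is a minimal usage sketch (illustrative, not part of the commit; the sample text is made up and `en_core_web_sm` is assumed to be installed, see the README):

```python
# Minimal usage sketch of EnglishCorpus (illustrative, not part of the commit).
from preprocessing import EnglishCorpus

text = "LexRank is an extractive summarizer.\nIt ranks sentences by their similarity to each other."

corpus_maker = EnglishCorpus()
cleaned = corpus_maker.preprocessing(text)            # remove line breaks, normalize characters
sentences = corpus_maker.make_sentence_list(cleaned)  # spaCy sentence segmentation
corpus = corpus_maker.make_corpus()                   # one space-separated token string per sentence
print(corpus)
```
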
Python/Text_Summary/Lex_Rank/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
sumy==0.8.1
spacy==2.3.2
neologdn==0.4
Python/Text_Summary/Lex_Rank/summary_make.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
from preprocessing import EnglishCorpus

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.utils import get_stop_words
from sumy.summarizers.lex_rank import LexRankSummarizer


def summarize_sentences(sentences: str, language="english") -> list:
    """
    Prepares the summary of the sentences.
    Calls preprocessing to generate a list of processed sentences.
    Uses LexRank summarization to prepare the summary.
    :param sentences: Sentences from the text file
    :param language: Language used, default=English
    :return: Summary of the source file
    """
    # Prepare the sentences
    corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    # make_sentence_list also stores the spaCy sentences on corpus_maker for make_corpus()
    preprocessed_sentence_list = corpus_maker.make_sentence_list(preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))

    # LexRank ranks sentences by graph-based similarity scoring
    summarizer = LexRankSummarizer()

    # Stop words are words which do not affect the context of the text
    summarizer.stop_words = get_stop_words(language)

    # Limit the summary to one-fifth of the article (see README)
    summary = summarizer(document=parser.document, sentences_count=len(corpus) * 2 // 10)

    return summary
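A minimal direct call, for reference (illustrative only; `article.txt` is a placeholder file name). Since `sentences_count = len(corpus) * 2 // 10`, inputs of fewer than five sentences produce an empty summary:

```python
# Illustrative call to summarize_sentences (not part of the commit).
from summary_make import summarize_sentences

with open("article.txt", encoding="utf-8") as f:   # placeholder file name
    text = f.read()

# The summary is roughly one-fifth of the source, so very short inputs return nothing.
for sentence in summarize_sentences(text):
    print(sentence)
```
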

Python/Text_Summary/README.md

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Text_Summary

[![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)

Text summarization is an advanced project that comes under the umbrella of Natural Language Processing.
There are multiple methods people use in order to summarize text.

They can be effectively grouped under two methods:

- Abstractive: Understand the true context of the text before summarization (like a human).
- Extractive: Rank the text within the file and identify the impactful terms.

While both of these approaches are under research, extractive summarization is presently used across multiple platforms.
There are also multiple methods by which text is summarized under the extractive approach.

In this project we use the two important approaches, __Lex Rank__ & __Text Rank__, and discuss their pros and cons.
Click
[here](https://en.wikipedia.org/wiki/Automatic_summarization#:~:text=The%20edges%20between%20sentences%20are,by%20the%20sentences'%20lengths)
for more info.

Both scripts use pretrained models and datasets from Natural Language Processing libraries.

## Structure

- [Lex Rank](Lex_Rank) contains the necessary files for the Lex Ranking approach.
- [Text Rank](Text_Rank) contains the necessary files for the Text Ranking approach.
- [Assets](assets) contains the text files.
## Instructions

A detailed set of instructions can be found in the respective directories.

## Author(s)

Made by [Vybhav Chaturvedi](https://www.linkedin.com/in/vybhav-chaturvedi-0ba82614a/)
## Setup instructions

- Set up a `python 3.x` virtual environment.
- `Activate` the environment.
- Install the dependencies using ```pip3 install -r requirements.txt```
- You are all set, and the scripts in [Lex Rank](Lex_Rank) and [Text Rank](Text_Rank) are ready to run.
- Carefully follow the instructions.
## Further Readings

The [Wikipedia article on automatic summarization](https://en.wikipedia.org/wiki/Automatic_summarization) gives more background on the graph-based sentence ranking that both approaches rely on.
## Usage

Run the script for the chosen approach (`main.py` for Lex Rank, `text_summary.py` for Text Rank) and follow the prompts, comments and guidelines.

```
Sample -
Enter the Source File: <path to the source text file>
```
## Output

Output of the summarizer

![Output](assets/Output.PNG)

Source text

![Source text](assets/Sample.PNG)

Generated summary

![Summary](assets/TextFile.PNG)

Python/Text_Summary/Text_Rank/README.md

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Text_Rank

Text Rank approach for text summarization.

## Dependencies

- nltk
- numpy
- networkx

## NLTK models

- `stopwords`: Stop words are English words which do not add much meaning to a sentence.

## Setup

- Set up a `python 3.x` virtual environment.
- `Activate` the environment.
- Install the dependencies using ```pip3 install -r requirements.txt```
- Set up the models by running the following commands,

```bash
$ python -m nltk.downloader stopwords
```

- Run the `text_summary.py` file.
- Enter the source path.

## Results

The code generates the token weights for the set of words, which show the relative importance of each word according to
the summarizer; to see them, just uncomment line 112 (_l112_) in `text_summary.py`.

Results can be found [here](../assets).
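To illustrate the Text Rank idea described above, the following is a minimal sketch built only from the dependencies listed in this README (nltk, numpy, networkx). It is not the repository's `text_summary.py`, just an approximation of the approach; the naive sentence splitting is an assumption made to keep it short:

```python
# A minimal sketch of the Text Rank idea (not the repository's text_summary.py),
# using only the dependencies listed above: nltk, numpy and networkx.
import numpy as np
import networkx as nx
from nltk.corpus import stopwords


def textrank_summary(text: str, num_sentences: int = 3) -> list:
    stop_words = set(stopwords.words("english"))
    # Naive sentence splitting, just for the sketch.
    sentences = [s.strip() for s in text.split(".") if s.strip()]

    # Bag-of-words vector per sentence, ignoring stop words.
    vocab = sorted({w.lower() for s in sentences for w in s.split()
                    if w.lower() not in stop_words})
    index = {w: i for i, w in enumerate(vocab)}
    vectors = np.zeros((len(sentences), len(vocab)))
    for i, s in enumerate(sentences):
        for w in s.split():
            if w.lower() in index:
                vectors[i, index[w.lower()]] += 1

    # Cosine similarity between every pair of sentences becomes the edge weight.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-9
    similarity = (vectors / norms) @ (vectors / norms).T
    np.fill_diagonal(similarity, 0)

    # PageRank over the sentence-similarity graph; top-ranked sentences form the summary.
    scores = nx.pagerank(nx.from_numpy_array(similarity))
    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    return [sentences[i] for i in sorted(ranked[:num_sentences])]
```
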
Python/Text_Summary/Text_Rank/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
nltk==3.2.4
numpy==1.19.5
networkx==2.5

0 commit comments
