Commit b4289c1

added classifier lambda code
1 parent ee99e19 commit b4289c1

13 files changed: +447 -0 lines changed

.github/workflows/classification.yml

Lines changed: 39 additions & 0 deletions
name: classification

on:
  pull_request:
    branches:
      - main
    paths:
      - "classification/**"

jobs:
  classification:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./classification
    steps:
      - name: Checkout
        uses: actions/checkout@v2
        with:
          ref: ${{ github.ref }}
      - name: Build container
        run: |
          docker build --tag classification:latest .
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Push2ECR
        id: ecr
        uses: jwalton/gh-ecr-push@v1
        with:
          access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          region: us-east-1
          image: classification:latest
      - name: Update lambda with image
        run: aws lambda update-function-code --function-name classification --image-uri 968911158010.dkr.ecr.us-east-1.amazonaws.com/classification:latest
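
Once this workflow has pushed a fresh image and pointed the function at it, the deployed Lambda accepts a JSON event with the same keys that Classifier.__call__ reads (texts, model_name, tokenizer_name). A minimal invocation sketch with boto3, assuming AWS credentials are configured locally and the function name classification from the last step above:

import json

import boto3  # host-side dependency; not part of the Lambda image

client = boto3.client("lambda", region_name="us-east-1")

# Event shape mirrors what main.lambda_handler forwards to Classifier.__call__
event = {
    "texts": ["I love this!", "This is terrible."],
    "model_name": "cardiffnlp/twitter-roberta-base-sentiment",
    "tokenizer_name": "roberta-base",
}

response = client.invoke(
    FunctionName="classification",
    Payload=json.dumps(event).encode("utf-8"),
)
print(json.loads(response["Payload"].read()))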

classification/.gitignore

Lines changed: 136 additions & 0 deletions
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

.aws-sam
*.pyc
.vscode
.DS_store
**.bin
**.ipynb_checkpoints

classification/Dockerfile

Lines changed: 21 additions & 0 deletions
FROM amazon/aws-lambda-python

ARG MODEL_DIR=./models

ENV TRANSFORMERS_CACHE=$MODEL_DIR
ENV TRANSFORMERS_VERBOSITY=error

RUN yum -y install gcc-c++

COPY requirements.txt requirements.txt
RUN pip install torch==1.8.0+cpu -f https://download.pytorch.org/whl/torch_stable.html --no-cache-dir
RUN pip install -r requirements.txt --no-cache-dir

COPY ./ ./

# Running the test suite here also downloads and caches the transformer model inside the image
RUN pip install pytest --no-cache-dir && pytest tests -s -vv

RUN chmod -R 0777 $MODEL_DIR

CMD ["main.lambda_handler"]

classification/README.MD

Lines changed: 2 additions & 0 deletions
## Classification service
Classification using AWS Lambda & Transformers

classification/main.py

Lines changed: 11 additions & 0 deletions
from src.classifier import Classifier

# Instantiated at module load so the model is warmed up before the first invocation
pipeline = Classifier()


def lambda_handler(event, context):
    try:
        return pipeline(event)
    except Exception:
        # Re-raise so failures surface in Lambda logs and metrics
        raise
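
For a quick in-process smoke test, no container or AWS setup is needed; the handler can be called directly with a dict event. A sketch, assuming it is run from the classification/ directory (so the src imports resolve) and that the model weights can be downloaded on first use:

from main import lambda_handler

event = {
    "texts": ["the movie was okay, not great"],
    "model_name": "cardiffnlp/twitter-roberta-base-sentiment",
    "tokenizer_name": "roberta-base",
}

# context is unused by the handler, so None is fine for local testing
print(lambda_handler(event, None))
# -> {"predictions": [{"label": ..., "score": ...}]}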

classification/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
transformers==4.*
2+
tqdm==4.*
3+
scikit-learn==0.24.*

classification/src/__init__.py

Whitespace-only changes.

classification/src/classifier.py

Lines changed: 81 additions & 0 deletions
import warnings
from functools import lru_cache

warnings.filterwarnings("ignore")

from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, Pipeline, pipeline)

from src import config, utils

logger = utils.create_logger(project_name=config.PREDICTION_TYPE, level="INFO")


class Classifier:
    def __init__(self):
        # Warm up with positional args so the lru_cache key matches the call made in __call__
        _ = self.get_sentiment_pipeline(config.DEFAULT_MODEL_NAME, config.DEFAULT_TOKENIZER_NAME)

    @staticmethod
    @lru_cache(maxsize=config.CACHE_MAXSIZE)
    def get_sentiment_pipeline(model_name: str, tokenizer_name: str) -> Pipeline:
        """Build (or fetch from the cache) the sentiment pipeline for the given model and tokenizer.

        Args:
            model_name (str): name of the model
            tokenizer_name (str): name of the tokenizer

        Returns:
            Pipeline: sentiment-analysis pipeline
        """
        logger.info(f"Loading model: {model_name}")
        id2label = config.ID_SENTIMENT_MAPPING[model_name]
        label2id = {label: idx for idx, label in id2label.items()}

        model_config = AutoConfig.from_pretrained(model_name)
        model_config.label2id = label2id
        model_config.id2label = id2label
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, config=model_config
        )
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        classification_pipeline = pipeline(
            "sentiment-analysis", model=model, tokenizer=tokenizer
        )
        return classification_pipeline

    def get_clean_text(self, text: str) -> str:
        """Normalize text before inference.

        Args:
            text (str): raw input text

        Returns:
            str: stripped, lower-cased text
        """
        return text.strip().lower()

    def __call__(self, request: dict) -> dict:
        """Predict the sentiment of the given texts.

        Args:
            request (dict): request containing the list of texts to classify

        Returns:
            dict: predicted classes and rounded scores for the given texts
        """
        texts = [self.get_clean_text(text) for text in request["texts"]]
        model_name = request["model_name"]
        tokenizer_name = request["tokenizer_name"]

        logger.info(f"Predicting sentiment for {len(texts)} texts")
        classification_pipeline = self.get_sentiment_pipeline(model_name, tokenizer_name)

        predictions = classification_pipeline(texts)
        for i, pred in enumerate(predictions):
            predictions[i]["score"] = round(pred["score"], 2)

        return {"predictions": predictions}
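
Since get_sentiment_pipeline is wrapped in lru_cache, repeated requests for the same (model_name, tokenizer_name) pair reuse the already-loaded pipeline, and up to CACHE_MAXSIZE distinct pairs stay resident at once. A sketch making that visible through the standard cache_info() counters:

from src.classifier import Classifier

clf = Classifier()  # __init__ already loaded the default model (one cache miss)

request = {
    "texts": ["cautiously optimistic"],
    "model_name": "cardiffnlp/twitter-roberta-base-emotion",
    "tokenizer_name": "roberta-base",
}
clf(request)  # new (model, tokenizer) pair -> cache miss, model is loaded
clf(request)  # same pair -> cache hit, no reload

print(Classifier.get_sentiment_pipeline.cache_info())
# CacheInfo(hits=1, misses=2, maxsize=4, currsize=2)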

classification/src/config.py

Lines changed: 20 additions & 0 deletions
PREDICTION_TYPE = "classification"

DEFAULT_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
DEFAULT_TOKENIZER_NAME = "roberta-base"
ID_SENTIMENT_MAPPING = {  # add an entry for every model to be supported
    "cardiffnlp/twitter-roberta-base-sentiment": {
        0: "NEGATIVE",
        1: "NEUTRAL",
        2: "POSITIVE",
    },
    "cardiffnlp/twitter-roberta-base-emotion": {
        0: "ANGER",
        1: "JOY",
        2: "OPTIMISM",
        3: "SADNESS",
    },
}

# cache
CACHE_MAXSIZE = 4
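
Per the inline comment, supporting another checkpoint is just a matter of adding its id-to-label mapping. A hypothetical sketch; the model name and label order below are placeholders and must match how the checkpoint was actually trained:

# Hypothetical entry: "some-org/binary-sentiment" is a placeholder name,
# and the index order must mirror the checkpoint's training labels.
ID_SENTIMENT_MAPPING["some-org/binary-sentiment"] = {
    0: "NEGATIVE",
    1: "POSITIVE",
}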
