Commit 2c06f09

TypingKoala (Johnny Bui) authored
Add Filtering and Aggregation Processors (#14)
* Add initial implementations of filter and aggregate
* Add tests for filtering and aggregation
* Add new processors to docs
* Add support for FilterByIndex on single-indices
* Add additional test case for FilterByIndex for single-indices
* Add improved examples for aggregators
* Increment version number and document changes
* Improve docstrings of filter and aggregate functions
* Add test to check that aggregate excludes non-numeric metrics

Co-authored-by: Johnny Bui <[email protected]>
1 parent f1df36d · commit 2c06f09

File tree: 6 files changed, +269 −15 lines changed


README.md

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,7 @@ Take a look at the notebooks below to demonstrate the functionality of FTPVL.
 1. [Using `HydraFetcher` and Processors](https://colab.research.google.com/drive/1BIQ-iulDFpzcve7lGJPwLePJ5ETBJ6Ut?usp=sharing)
 2. [Styling tables with `SingleTableVisualizer`](https://colab.research.google.com/drive/1u3EnmIYnTBk-LXZhqNHt_h4aMuq-_cWq?usp=sharing)
 3. [Comparing two different Evaluations](https://colab.research.google.com/drive/1I7InmA6210vIIwdQ7TGHE6aF_WwIm1dM?usp=sharing)
+4. [Filtering and Aggregating an Evaluation](https://colab.research.google.com/drive/1DDwlQFS81RGLL-q8DsgICF-HOC5ir6oS?usp=sharing)
 
 ## Documentation
 Extensive documentation, including a *Getting Started* guide, is available on
@@ -33,6 +34,7 @@ make html
 * `pandas`: for data management and processing ([website](https://pandas.pydata.org/))
 * `seaborn`: for colormap generation ([website](https://seaborn.pydata.org/))
 * `jinja2`: for visualization generation ([website](https://jinja.palletsprojects.com/))
+* `scipy`: for support of built-in aggregators ([website](https://www.scipy.org/))
 
 ### Development Dependencies
 * `requests-mock`: for mocking request object for testing fetchers ([website](https://requests-mock.readthedocs.io/en/latest/))
@@ -44,6 +46,9 @@ make html
 * `sphinx-rtd-theme`: for documentation generation (theme) ([website](https://github.com/readthedocs/sphinx_rtd_theme))
 
 ## Changes
+### 0.1.6
+* Added support for filter and aggregator processors, fixes [#9](https://github.com/SymbiFlow/FPGA-Tool-Performance-Visualization-Library/issues/9)
+
 ### 0.1.5
 * Added support for custom projects and jobsets in HydraFetcher.

docs/topics/api.rst

Lines changed: 9 additions & 0 deletions
@@ -67,6 +67,15 @@ Processors API
 .. autoclass:: ftpvl.processors.RelativeDiff
    :members:
 
+.. autoclass:: ftpvl.processors.FilterByIndex
+   :members:
+
+.. autoclass:: ftpvl.processors.Aggregate
+   :members:
+
+.. autoclass:: ftpvl.processors.GeomeanAggregate
+   :members:
+
 .. _topics-api-styles:
 
 Styles API

ftpvl/processors.py

Lines changed: 107 additions & 2 deletions
@@ -1,9 +1,11 @@
 """ Processors transform Evaluations to be more useful when visualized. """
+import math
+from typing import Any, Callable, Dict, List, Union
 
-from typing import List, Dict
-import pandas as pd
 import numpy as np
+import pandas as pd
 from ftpvl.evaluation import Evaluation
+from scipy import stats
 
 
 class Processor:
@@ -438,3 +440,106 @@ def process(self, b: Evaluation) -> Evaluation:
         difference_eval = Evaluation(diff)
 
         return difference_eval
+
+class FilterByIndex(Processor):
+    """
+    Processor that filters an Evaluation by matching a specified index value
+    after indexing.
+
+    This is best used in a processing pipeline after the Reindex processor.
+    For filtering an evaluation based on metric values (which are not
+    indices), use the FilterByMetric processor.
+
+    Parameters
+    ----------
+    index_name : str
+        the name of the index to use when filtering
+    index_value : Any
+        the value to compare with
+
+    Examples
+    --------
+    >>> a = Evaluation(pd.DataFrame(
+    ...     data=[
+    ...         {"x": 1, "y": 5},
+    ...         {"x": 4, "y": 10}
+    ...     ],
+    ...     index=pd.Index(["a", "b"], name="key")))
+    >>> a.process([FilterByIndex("key", "a")]).get_df()
+         x  y
+    key
+    a    1  5
+    """
+    def __init__(self, index_name: str, index_value: Any):
+        self.index_name = index_name
+        self.index_value = index_value
+
+    def process(self, input_eval: Evaluation):
+        old_df = input_eval.get_df()
+        if isinstance(old_df.index, pd.MultiIndex):
+            new_df = old_df.xs(self.index_value, level=self.index_name)
+        elif isinstance(old_df.index, pd.Index):
+            # slicing instead of indexing to maintain shape
+            new_df = old_df.loc[self.index_value:self.index_value]
+        else:
+            raise ValueError("Incompatible dataframe index.")
+        return Evaluation(new_df, input_eval.get_eval_id())
+
+class Aggregate(Processor):
+    """
+    Processor that aggregates all the numeric fields of an Evaluation
+    using a specified function.
+
+    This acts as a superclass for specific aggregator implementations, such
+    as GeomeanAggregate. It can also be used for custom aggregations, by
+    supplying an aggregator function to the constructor.
+
+    Parameters
+    ----------
+    func : Callable[[pd.Series], Union[int, float]]
+        a function that takes a Pandas Series and aggregates it into a
+        single number, possibly a NaN value
+
+    Examples
+    --------
+    >>> a = Evaluation(pd.DataFrame(
+    ...     data=[
+    ...         {"x": 1, "y": 5},
+    ...         {"x": 4, "y": 10}
+    ...     ]))
+    >>> a.process([Aggregate(lambda x: x.sum())]).get_df()
+       x   y
+    0  5  15
+    """
+    def __init__(self, func: Callable[[pd.Series], Union[int, float]]):
+        self.func = func
+
+    def process(self, input_eval: Evaluation):
+        old_df = input_eval.get_df()
+        numeric_columns = old_df.select_dtypes(include=['number']).dropna(axis=1).columns
+        new_df = pd.DataFrame([old_df[numeric_columns].agg(self.func)])
+        return Evaluation(new_df, input_eval.get_eval_id())
+
+class GeomeanAggregate(Aggregate):
+    """
+    Processor that aggregates an entire Evaluation by finding the geometric
+    mean of each numeric metric.
+
+    Subclass of the Aggregate class.
+
+    Examples
+    --------
+    >>> a = Evaluation(pd.DataFrame(
+    ...     data=[
+    ...         {"x": 1, "y": 8},
+    ...         {"x": 4, "y": 8}
+    ...     ]))
+    >>> a.process([GeomeanAggregate()]).get_df()
+         x    y
+    0  2.0  8.0
+    """
+    def __init__(self):
+        def geomean(x):
+            x = x.dropna()
+            return stats.gmean(x) if not x.empty else math.nan
+        super().__init__(geomean)
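The single-index branch of `FilterByIndex.process` slices with `old_df.loc[value:value]` rather than indexing with `old_df.loc[value]`, as its inline comment notes, so the result stays a two-dimensional DataFrame. A minimal standalone sketch of that distinction (the frame below mirrors the single-index test fixture in this commit, but is otherwise illustrative only):

```python
import pandas as pd

# A single-index frame with duplicate labels, as in
# test_filterbyindex_singleindex below.
df = pd.DataFrame(
    {"value": [10, 5, 3, 100, 31]},
    index=pd.Index(["a", "a", "a", "b", "b"], name="key"),
)

# Label *slicing* keeps all matching rows and returns a DataFrame...
sliced = df.loc["a":"a"]
print(sliced.shape)  # (3, 1)

# ...whereas plain .loc indexing on a unique label would collapse the
# result to a Series, changing the shape downstream processors expect.
```

Note that label slicing on a non-unique index only works because the index here is sorted; on an unsorted, non-unique index pandas raises an error.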

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ pandas
 requests-mock
 seaborn
 jinja2
+scipy
 
 pylint
 pytest

setup.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 AUTHOR = 'Johnny Bui'
 REQUIRES_PYTHON = '>=3.6.0'
-VERSION = '0.1.5'
+VERSION = '0.1.6'
 
 # What packages are required for this module to be executed?
 REQUIRED = [

tests/test_processor.py

Lines changed: 146 additions & 12 deletions
@@ -5,18 +5,7 @@
     assert_frame_equal, assert_series_equal, assert_index_equal
 )
 
-from ftpvl.processors import (
-    AddNormalizedColumn,
-    CleanDuplicates,
-    MinusOne,
-    StandardizeTypes,
-    ExpandColumn,
-    Reindex,
-    SortIndex,
-    NormalizeAround,
-    Normalize,
-    RelativeDiff
-)
+from ftpvl.processors import *
 
 from ftpvl.evaluation import Evaluation
 
@@ -34,6 +23,9 @@ class TestProcessor:
     SortIndex()
     NormalizeAround()
     Normalize()
+    FilterByIndex()
+    Aggregate()
+    GeomeanAggregate()
     """
 
     def test_minusone(self):
@@ -505,3 +497,145 @@ def test_relativediff(self):
         )
 
         assert_frame_equal(expected, result)
+
+    def test_filterbyindex_multindex(self):
+        """ tests if filtering by index works for multi-index dataframe """
+        # test dataframe:
+        # {"group": "a", "key": "a", "value": 10},
+        # {"group": "a", "key": "b", "value": 5},
+        # {"group": "a", "key": "c", "value": 3},
+        # {"group": "b", "key": "d", "value": 100},
+        # {"group": "b", "key": "e", "value": 31}
+
+        idx_arrays = [["a", "a", "a", "b", "b"], ["a", "b", "c", "d", "e"]]
+        index = pd.MultiIndex.from_arrays(idx_arrays, names=("group", "key"))
+        df = pd.DataFrame({"value": [10, 5, 3, 100, 31]}, index=index)
+        eval1 = Evaluation(df, eval_id=10)
+
+        # filter by first index
+        pipeline = [FilterByIndex("group", "a")]
+        result = eval1.process(pipeline)
+
+        expected_index = pd.Index(["a", "b", "c"], name="key")
+        expected_df = pd.DataFrame({"value": [10, 5, 3]}, index=expected_index)
+
+        assert_frame_equal(result.get_df(), expected_df)
+        assert result.get_eval_id() == 10
+
+        # filter by second index
+        pipeline = [FilterByIndex("key", "a")]
+        result = eval1.process(pipeline)
+
+        expected_index = pd.Index(["a"], name="group")
+        expected_df = pd.DataFrame({"value": [10]}, index=expected_index)
+
+        assert_frame_equal(result.get_df(), expected_df)
+        assert result.get_eval_id() == 10
+
+    def test_filterbyindex_singleindex(self):
+        """ tests if filtering by index works for single-index dataframe """
+        # test dataframe:
+        # {"key": "a", "value": 10},
+        # {"key": "a", "value": 5},
+        # {"key": "a", "value": 3},
+        # {"key": "b", "value": 100},
+        # {"key": "b", "value": 31}
+
+        idx_array = ["a", "a", "a", "b", "b"]
+        index = pd.Index(idx_array, name="key")
+        df = pd.DataFrame({"value": [10, 5, 3, 100, 31]}, index=index)
+        eval1 = Evaluation(df, eval_id=10)
+
+        # filter by index
+        pipeline = [FilterByIndex("key", "a")]
+        result = eval1.process(pipeline)
+
+        expected_index = pd.Index(["a", "a", "a"], name="key")
+        expected_df = pd.DataFrame({"value": [10, 5, 3]}, index=expected_index)
+
+        assert_frame_equal(result.get_df(), expected_df)
+        assert result.get_eval_id() == 10
+
+    def test_aggregate(self):
+        """ Test aggregate processor with custom aggregator functions """
+        df = pd.DataFrame(
+            [
+                {"a": 1, "b": 1, "c": 5},
+                {"a": 1, "b": 2, "c": 4},
+                {"a": 3, "b": 3, "c": 3},
+                {"a": 4, "b": 4, "c": 2},
+                {"a": 5, "b": 5, "c": 1},
+            ]
+        )
+        eval1 = Evaluation(df, eval_id=20)
+
+        pipeline = [Aggregate(lambda x: x.sum())]
+        result = eval1.process(pipeline)
+
+        expected_df = pd.DataFrame(
+            [
+                {"a": 14, "b": 15, "c": 15}
+            ]
+        )
+        assert_frame_equal(result.get_df(), expected_df)
+        assert eval1.get_eval_id() == 20
+
+        pipeline2 = [Aggregate(lambda x: x.product())]
+        result2 = eval1.process(pipeline2)
+
+        expected_df2 = pd.DataFrame(
+            [
+                {"a": 60, "b": 120, "c": 120}
+            ]
+        )
+        assert_frame_equal(result2.get_df(), expected_df2)
+        assert result2.get_eval_id() == 20
+
+    def test_aggregate_exclude_nonnumeric(self):
+        """ Check if aggregate processor excludes fields that are non-numeric """
+        df = pd.DataFrame(
+            [
+                {"a": 1, "b": 1, "c": "a"},
+                {"a": 1, "b": 2, "c": "b"},
+                {"a": 3, "b": 3, "c": "c"},
+                {"a": 4, "b": 4, "c": "d"},
+                {"a": 5, "b": 5, "c": "e"},
+            ]
+        )
+        eval1 = Evaluation(df, eval_id=20)
+
+        pipeline = [Aggregate(lambda x: x.sum())]
+        result = eval1.process(pipeline)
+
+        expected_df = pd.DataFrame(
+            [
+                {"a": 14, "b": 15}
+            ]
+        )
+        assert_frame_equal(result.get_df(), expected_df)
+        assert eval1.get_eval_id() == 20
+
+    def test_geomean_aggregate(self):
+        """ Test built-in geomean aggregator """
+        df = pd.DataFrame(
+            [
+                {"a": 1, "b": 1, "c": 5},
+                {"a": 1, "b": 2, "c": 4},
+                {"a": 3, "b": 3, "c": 3},
+                {"a": 4, "b": 4, "c": 2},
+                {"a": 5, "b": 5, "c": 1},
+            ]
+        )
+        eval1 = Evaluation(df, eval_id=20)
+
+        pipeline = [GeomeanAggregate()]
+        eval1 = eval1.process(pipeline)
+
+        expected_a = (1 * 1 * 3 * 4 * 5) ** (1/5)
+        expected_b = expected_c = (1 * 2 * 3 * 4 * 5) ** (1/5)
+        expected_df = pd.DataFrame(
+            [
+                {"a": expected_a, "b": expected_b, "c": expected_c}
+            ]
+        )
+        assert_frame_equal(eval1.get_df(), expected_df)
+        assert eval1.get_eval_id() == 20
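The expected values in `test_geomean_aggregate` come straight from the definition of the geometric mean, which `scipy.stats.gmean` computes as exp(mean(log x)). A dependency-free sketch (the `geomean` helper below is a hypothetical stand-in for `stats.gmean`, valid for positive inputs only) reproducing those expectations:

```python
import math

def geomean(values):
    """Geometric mean via exp(mean(log x)); NaN for an empty input.

    Mirrors the NaN-dropping behavior of GeomeanAggregate's inner
    helper, but is a sketch, not the library code itself.
    """
    vals = [v for v in values if not math.isnan(v)]
    if not vals:
        return math.nan
    # Equivalent to the n-th root of the product, but numerically stabler.
    return math.exp(sum(math.log(v) for v in vals) / len(vals))

# The expectations from test_geomean_aggregate:
expected_a = (1 * 1 * 3 * 4 * 5) ** (1 / 5)  # column "a"
expected_b = (1 * 2 * 3 * 4 * 5) ** (1 / 5)  # columns "b" and "c"
print(math.isclose(geomean([1, 1, 3, 4, 5]), expected_a))  # True
print(math.isclose(geomean([1, 2, 3, 4, 5]), expected_b))  # True
```

The log-sum form is why `GeomeanAggregate` drops NaN values first: a single NaN would otherwise poison the whole mean.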
