Skip to content

Commit 9eccb74

Browse files
Johannes Hötter
authored and committed
change to graphql and add autolf v1
1 parent 2497c03 commit 9eccb74

File tree

8 files changed

+333
-151
lines changed

8 files changed

+333
-151
lines changed

onetask/__init__.py

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
11
# -*- coding: utf-8 -*-
22

3-
from typing import Callable, List
4-
5-
from onetask import api_calls, settings
3+
from typing import Callable
64
from wasabi import msg
7-
8-
from onetask.labeling_function import build_keywords_lf, unpack_python_function
5+
from onetask import api_calls, settings, util, auto_lf
96

107

118
class Client:
@@ -16,38 +13,43 @@ def __init__(
1613
self.session_token = api_calls.create_session_token(
1714
user_name=user_name, password=password
1815
)
16+
self.project_id = project_id
1917
if self.session_token is not None:
2018
msg.good("Logged in to system.")
19+
if not api_calls.ProjectByProjectId(
20+
self.project_id, self.session_token
21+
).exists:
22+
msg.fail(f"Project with ID {self.project_id} does not exist.")
2123
else:
2224
msg.fail("Could not log in. Please check your username and password.")
23-
self.project_id = project_id
2425

25-
def register_custom_lf(self, lf: Callable) -> None:
26-
fn_name, source_code, description = unpack_python_function(lf)
27-
_ = api_calls.RegisterLabelingFunctionCall(
28-
fn_name=fn_name,
29-
source_code=source_code,
30-
description=description,
31-
project_id=self.project_id,
32-
session_token=self.session_token,
33-
)
34-
msg.good(f"Registered labeling function '{fn_name}'.")
26+
def manually_labeled_records(self, as_df: bool = True):
    """Fetch the manually labeled records of this project.

    :param as_df: if True (and any records exist), return them as a DataFrame.
    :return: the records, either as a DataFrame or as a plain list.
    """
    raw = api_calls.ManuallyLabeledRecords(self.project_id, self.session_token).data
    unpacked = util.unpack_records(raw)
    # Fall back to the raw list when a DataFrame was not requested or
    # when there is nothing to convert.
    if not (as_df and unpacked):
        return unpacked
    return util.records_to_df(unpacked)
3535

36-
def register_keywords_lf(
37-
self,
38-
label: str,
39-
keywords: List[str],
40-
attributes: List[str],
41-
lowercase: bool = True,
36+
def autogenerate_regex_labeling_functions(
    self, nlp, attribute, num_functions: int = 10
):
    """Derive regex candidates from the manually labeled records and print
    labeling-function suggestions for them.

    :param nlp: spaCy language model used for tokenization.
    :param attribute: record attribute whose text is analyzed.
    :param num_functions: maximum number of regex candidates to derive.
    """
    labeled = self.manually_labeled_records(as_df=True)
    # Guard clause: nothing to analyze without manual labels.
    if len(labeled) == 0:
        msg.fail("No manually labeled records available!")
        return
    regex_candidates = auto_lf.derive_regex_candidates(
        nlp, labeled, attribute, most_common=num_functions
    )
    auto_lf.create_regex_fns(labeled, regex_candidates, attribute)
47+
48+
def register_lf(self, lf: Callable) -> None:
    """Register a Python labeling function in the onetask system.

    :param lf: the labeling function to upload; its source code and
        docstring are extracted and sent to the server.
    """
    unpacked = util.unpack_python_function(lf, self.project_id)
    project_id, name, source_code, docs = unpacked
    api_calls.CreateLabelingFunction(
        project_id, name, source_code, docs, self.session_token
    )
    msg.good(f"Registered labeling function '{name}'.")

onetask/api_calls.py

Lines changed: 104 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,13 @@
11
# -*- coding: utf-8 -*-
22
import pkg_resources
3-
43
from onetask import exceptions, settings
4+
import requests
55

66
try:
77
version = pkg_resources.get_distribution("onetask").version
88
except pkg_resources.DistributionNotFound:
99
version = "noversion"
1010

11-
import requests
12-
from typing import Dict, Any, Optional, Union
13-
14-
from better_abc import ABCMeta, abstract_attribute
15-
1611

1712
# no call to the onetask system, therefore include it here
1813
def create_session_token(user_name: str, password: str):
@@ -40,32 +35,29 @@ def create_session_token(user_name: str, password: str):
4035
return session_token
4136

4237

43-
class OneTaskCall(metaclass=ABCMeta):
44-
def __init__(
45-
self, url: str, session_token: str, data: Optional[Dict[str, Any]] = None
46-
):
38+
class GraphQLRequest:
39+
def __init__(self, query, variables, session_token):
40+
self.query = query
41+
self.variables = variables
42+
self.session_token = session_token
43+
44+
def execute(self):
45+
body = {
46+
"query": self.query,
47+
"variables": self.variables,
48+
}
49+
4750
headers = {
4851
"Content-Type": "application/json",
4952
"User-Agent": f"python-sdk-{version}",
50-
"Authorization": f"Bearer {session_token}",
53+
"Authorization": f"Bearer {self.session_token}",
5154
}
5255

53-
if data is None:
54-
self.response = requests.request(self.method, url, headers=headers)
55-
else:
56-
self.response = requests.request(
57-
self.method, url, json=data, headers=headers
58-
)
59-
60-
@abstract_attribute
61-
def method(self):
62-
pass
56+
response = requests.post(url=settings.graphql(), json=body, headers=headers)
6357

64-
@property
65-
def content(self) -> Union[Dict[str, Any], exceptions.APIError]:
66-
status_code = self.response.status_code
58+
status_code = response.status_code
6759

68-
json_data = self.response.json()
60+
json_data = response.json()
6961

7062
if status_code == 200:
7163
return json_data
@@ -80,35 +72,95 @@ def content(self) -> Union[Dict[str, Any], exceptions.APIError]:
8072
raise exception
8173

8274

83-
class PostCall(OneTaskCall):
84-
def __init__(
85-
self,
86-
url: str,
87-
session_token: str,
88-
data: Dict[str, Any],
89-
):
90-
self.method = "POST"
75+
class ProjectByProjectId(GraphQLRequest):
    """GraphQL query that checks whether a project exists and fetches its
    label names.

    Attributes:
        data: the raw GraphQL response dict, or None if the request failed.
        exists: True if the project was found, False otherwise.
    """

    def __init__(self, project_id, session_token):
        QUERY = """
        query ($projectId: ID!) {
            projectByProjectId(projectId: $projectId) {
                id
                labels {
                    edges {
                        node {
                            name
                        }
                    }
                }
            }
        }
        """

        variables = {
            "projectId": project_id,
        }

        super().__init__(QUERY, variables, session_token)
        try:
            self.data = self.execute()
            # Guard against responses without a "data" key (e.g. a GraphQL
            # error payload); the original chained .get() would raise
            # AttributeError on None here.
            project = (self.data.get("data") or {}).get("projectByProjectId")
            self.exists = project is not None
        except exceptions.APIError:
            # Fix: self.data previously stayed unset on failure, so any
            # later access (e.g. by ManuallyLabeledRecords) raised
            # AttributeError instead of seeing a well-defined value.
            self.data = None
            self.exists = False
102+
103+
104+
class ManuallyLabeledRecords(GraphQLRequest):
    """GraphQL query fetching all records of a project that carry a manual
    label, together with their label associations."""

    def __init__(self, project_id, session_token):
        # First look up the project's label names; they serve as the
        # `manual` filter of the record search below.
        project_data = ProjectByProjectId(project_id, session_token).data
        label_edges = project_data["data"]["projectByProjectId"]["labels"]["edges"]
        label_names = [edge["node"]["name"] for edge in label_edges]

        QUERY = """
        query ($projectId: ID!, $manual: [String!]) {
            searchRecords(projectId: $projectId, manual: $manual) {
                data
                labelAssociations {
                    edges {
                        node {
                            label {
                                name
                            }
                            source
                        }
                    }
                }
            }
        }
        """

        variables = {"projectId": project_id, "manual": label_names}

        super().__init__(QUERY, variables, session_token)
        self.data = self.execute()
133+
134+
135+
class CreateLabelingFunction(GraphQLRequest):
    """GraphQL mutation registering a labeling function for a project."""

    def __init__(self, project_id, name, function, description, session_token):
        QUERY = """
        mutation (
            $projectId: ID!,
            $name: String!,
            $function: String!,
            $description: String!
        ) {
            createLabelingFunction(
                projectId: $projectId,
                name: $name,
                function: $function,
                description: $description
            ) {
                labelingFunction {
                    id
                }
            }
        }
        """

        variables = {
            "projectId": project_id,
            "name": name,
            "function": function,
            "description": description,
        }

        super().__init__(QUERY, variables, session_token)
        _ = self.execute()

onetask/auto_lf.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from collections import defaultdict
2+
from tqdm import tqdm
3+
from collections import Counter
4+
import re
5+
from collections import defaultdict
6+
import numpy as np
7+
from wasabi import msg
8+
9+
10+
def derive_regex_candidates(nlp, df, attribute, most_common=10):
    """Derive frequent regex candidates from the texts of labeled records.

    Each text is tokenized with spaCy; candidates are built from relevant
    tokens and their syntactic left/right children, with pure-digit tokens
    generalized via their shape (e.g. "2021" -> "[0-9][0-9][0-9][0-9]").
    Anchors `^`/`$` mark candidates occurring at the start/end of a text.

    :param nlp: spaCy language model.
    :param df: DataFrame holding the manually labeled records.
    :param attribute: column of `df` whose text is analyzed.
    :param most_common: number of most frequent candidates to return.
    :return: list of regex strings ordered by frequency.
    """
    if len(df) < 100:
        msg.warn(
            "Only very few records to analyze; it's best to continue labeling further records before analysis"
        )

    def normalize_token(token):
        # Generalize digit-only tokens via their shape; mixed alnum tokens
        # (shape contains "x") are kept verbatim.
        if "d" in token.shape_ and "x" not in token.shape_:
            return token.shape_.replace("d", "[0-9]")
        return token.text

    def is_relevant_token(token):
        return not (token.is_punct or token.is_stop or token.is_bracket)

    candidates = []
    for text in tqdm(df[attribute], total=len(df)):
        doc = nlp(text.lower())
        for token in doc:
            if not is_relevant_token(token):
                continue
            has_children = False
            for token_left in token.lefts:
                if is_relevant_token(token_left):
                    prefix = "^" if token_left.i == 0 else " "
                    # Fix: compare the token index (token.i) against the
                    # token count; the original used token.idx, which is a
                    # *character* offset, so the "$" anchor almost never
                    # triggered where intended.
                    suffix = "$" if token.i == len(doc) - 1 else " "
                    candidates.append(
                        f"{prefix}{normalize_token(token_left)}.*?{normalize_token(token)}{suffix}"
                    )
                    has_children = True
            for token_right in token.rights:
                if is_relevant_token(token_right):
                    prefix = "^" if token.i == 0 else " "
                    suffix = "$" if token_right.i == len(doc) - 1 else " "
                    candidates.append(
                        f"{prefix}{normalize_token(token)}.*?{normalize_token(token_right)}{suffix}"
                    )
                    has_children = True
            if not has_children:
                prefix = "^" if token.i == 0 else " "
                suffix = "$" if token.i == len(doc) - 1 else " "
                candidates.append(f"{prefix}{normalize_token(token)}{suffix}")
    return [regex for regex, _ in Counter(candidates).most_common(most_common)]
51+
52+
53+
def create_regex_fns(df, candidates, regex_col, label_col="label"):
    """Evaluate regex candidates against labeled records and print labeling
    functions for those that are precise and frequent enough.

    A candidate is turned into a labeling function when the precision of its
    majority label exceeds 0.75 and it covers at least 1% of the records.
    The generated functions are printed (not registered automatically).

    :param df: DataFrame with the manually labeled records.
    :param candidates: regex strings, e.g. from derive_regex_candidates.
    :param regex_col: column of `df` the regexes are matched against.
    :param label_col: column of `df` holding the manual label.
    """

    def regex_explainer(regex, attribute):
        # Build the human-readable description used as the docstring of the
        # generated labeling function.
        description = ""
        terms = regex.replace("^", "").replace("$", "").split(".*?")
        if "^" in regex:
            description += f"attribute '{attribute}' starts with term '{terms[0]}'"
            if len(terms) > 1:
                for term in terms[1:]:
                    description += f" (in-)directly followed by term '{term}'"
            if "$" in regex:
                description += " and then ends"
        elif "$" in regex:
            description += (
                f"attribute '{attribute}' somewhere contains term '{terms[0]}'"
            )
            if len(terms) > 1:
                for term in terms[1:]:
                    description += f" (in-)directly followed by term '{term}'"
            description += " and then ends"
        else:
            description += (
                f"attribute '{attribute}' somewhere contains term '{terms[0]}'"
            )
            if len(terms) > 1:
                for term in terms[1:]:
                    description += f" followed by term '{term}'"
        if "[0-9]" in regex:
            description += ", where [0-9] is an arbitrary number"
        description += "."
        return description

    def build_regex_lf(regex, attribute, prediction, iteration):
        # Source code of a ready-to-paste labeling function plus the call
        # that registers it via the SDK client.
        source_code = f"""
def regex_{iteration}(record):
    '''{regex_explainer(regex, attribute)}'''
    import re
    if re.search(r'{regex}', record['{attribute}'].lower()):
        return '{prediction}'

client.register_lf(regex_{iteration})
"""
        return source_code.strip()

    regex_nr = 1
    for regex in candidates:
        # Hoisted: compile once per candidate instead of re-parsing the
        # pattern for every record in the inner loop.
        pattern = re.compile(regex)
        labels = defaultdict(int)
        for text, label in zip(df[regex_col], df[label_col]):
            if pattern.search(text.lower()):
                labels[label] += 1
        coverage = sum(labels.values())
        if coverage > 0:
            # Majority label among covered records; like the original manual
            # loop, ties keep the first-seen label (dicts preserve insertion
            # order).
            regex_prediction = max(labels, key=labels.get)
            precision = np.round(labels[regex_prediction] / coverage, 2)
            coverage = np.round(coverage / len(df), 2)
            if precision > 0.75 and coverage >= 0.01:
                lf = build_regex_lf(regex, regex_col, regex_prediction, regex_nr)
                regex_nr += 1
                print(f"# Cov:\t{coverage}\tPrec:{precision}")
                print(lf)
                print()

0 commit comments

Comments
 (0)