feat(example): added suggest_categories_gpt script (#66)

ErikBjare · web-flow · commit 740ca220b178 · 2022-07-12T13:43:05.000+02:00
diff --git a/examples/suggest_categories.py b/examples/suggest_categories.py
@@ -7,17 +7,28 @@
 from collections import Counter
 from datetime import datetime, timedelta, timezone
 from tabulate import tabulate
-from typing import Dict
+from typing import Dict, List, Tuple, Any
 
 from aw_core import Event
 import aw_client
 from aw_client import queries
 
+
 # set up client
 awc = aw_client.ActivityWatchClient("test")
 
 
-def get_events():
+def example_categories():
+    """
+    Returns a small hard-coded list of categories, as (name path, rule) pairs.
+
+    A new list and rule dict are built on every call.
+    """
+    # TODO: Use tools in aw-research to load categories from toml file
+    activitywatch_rule = {
+        "type": "regex",
+        "regex": "aw-|activitywatch",
+        "ignore_case": True,
+    }
+    return [(("Work", "ActivityWatch"), activitywatch_rule)]
+
+
+def get_events(categories: List[Tuple[Tuple[str, ...], Dict[str, Any]]]):
     """
     Retrieves AFK-filtered events, only returns events which are Uncategorized.
     """
@@ -26,14 +37,6 @@ def get_events():
     now = datetime.now(tz=timezone.utc)
     timeperiods = [(start, now)]
 
-    # TODO: Use tools in aw-research to load categories from toml file
-    categories = [
-        (
-            ["Work"],
-            {"type": "regex", "regex": "aw-|activitywatch", "ignore_case": True},
-        ),
-    ]
-
     canonicalQuery = queries.canonicalEvents(
         queries.DesktopQueryParams(
             bid_window="aw-watcher-window_",
@@ -66,8 +69,9 @@ def events2words(events):
                         yield (word, e.duration)
 
 
-if __name__ == "__main__":
-    events = get_events()
+def main():
+    categories = example_categories()
+    events = get_events(categories)
 
     # find most common words, by duration
     corpus: Dict[str, timedelta] = Counter()  # type: ignore
@@ -79,3 +83,7 @@ def events2words(events):
     # The top words are rarely useful for categorization, as they are usually browsers and other categories
     # of activity which are too broad for it to make sense as a rule (except as a fallback).
     print(tabulate(corpus.most_common(50), headers=["word", "duration"]))  # type: ignore
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/suggest_categories_gpt.py b/examples/suggest_categories_gpt.py
@@ -0,0 +1,213 @@
+"""
+Uses GPT-3 to suggest new categories.
+
+Builds on suggest_categories.py for basic operations, like getting events.
+"""
+
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Tuple
+
+from aw_core import Event
+from aw_transform.classify import Rule, categorize
+
+from suggest_categories import example_categories, get_events
+
+Category = Tuple[List[str], Dict[str, Any]]
+
+
+def prompt_preamble(categories: List[Category]) -> str:
+    """
+    Builds the few-shot prompt preamble.
+
+    The preamble lists every given category (name path joined by " > ", plus its regex),
+    followed by a "---" separator and a fixed set of example questions and answers.
+    """
+    listing = []
+    for name, rule in categories:
+        listing.append(f" - Category: {' > '.join(name)}\n   Regex: {rule['regex']}")
+    categories_str = "\n\n".join(listing)
+
+    return f"""We will classify window titles into user-defined categories defined by regular expressions.
+If a suitable one doesn't exists, we will create one with a suitable regex.
+
+Existing categories:
+
+{categories_str}
+
+---
+
+What category should "ActivityWatch - wwww.github.com" be in?
+Category: Work > ActivityWatch
+
+What category should "reddit: the front page of the internet" be in?
+New Category: Media > Social > Reddit
+Regex: reddit
+
+What category should "Twitter" be in?
+New Category: Media > Social > Twitter
+Regex: Twitter
+
+What category should "Demis Hassabis: DeepMind - AI, Superintelligence & the Future of Humanity | Lex Fridman Podcast - YouTube - Mozilla Firefox" be in?
+New Category: Media > Video > YouTube
+Regex: YouTube
+
+What category should "Tweetdeck" be in?
+Modify Category: Media > Social > Twitter
+Append Regex: Tweetdeck
+
+What category should "cloudflare-ipfs.com | 524: A timeout occurred - cloudflare-ipfs.com - Mozilla Firefox" be in?
+Skip: No suitable category found or to suggest, best left as uncategorized.
+
+What category should "Mozilla Firefox" be in?
+Skip: No suitable category found or to suggest, best left as uncategorized.
+
+What category should "RimWorld" be in?
+New Category: Games > RimWorld
+Regex: RimWorld
+
+What category should "Minecraft" be in?
+New Category: Games > Minecraft
+Regex: Minecraft
+
+What category should "Free Porn Videos & Sex Movies - Porno, XXX, Porn Tube | Pornhub — Mozilla Firefox" be in?
+New Category: Media > Porn
+Regex: Pornhub"""
+
+
+def process_prompt(prompt, categories, quiet=False) -> List[Category]:
+    """
+    Processes the prompt preamble for categories created/modified in the prompt.
+
+    Everything after the first "---" is split into blank-line separated entries, each being a
+    question line (`What category should "<title>" be in?`) followed by the response lines.
+    Every response is applied, in order, through parse_gpt_response.
+
+    A prompt without separator, and entries without a quoted title or without a response,
+    are ignored instead of raising.
+
+    Returns the category list as returned by the last parse_gpt_response call
+    (or `categories` itself if nothing was applied).
+    """
+    _, _, examples = prompt.partition("---")
+    for entry in examples.split("\n\n"):
+        entry = entry.strip()
+        if not entry:
+            continue
+        question, _, response = entry.partition("\n")
+        # The title is everything between the first and the last double-quote of the
+        # question line, so titles which themselves contain double-quotes stay intact.
+        start, end = question.find('"'), question.rfind('"')
+        if start == end or not response.strip():
+            continue
+        title = question[start + 1 : end]
+        categories = parse_gpt_response(response, categories, title=title, quiet=quiet)
+    return categories
+
+
+def gpt_suggest(event: Event, categories: List[Category]) -> List[Category]:
+    """
+    Asks OpenAI GPT-3 which category an uncategorized event should be in.
+
+    The given categories are deep-copied before anything is applied to them; the returned list
+    is that copy after the preamble examples and the model's response have been processed.
+    The API key is read from the OPENAI_API_KEY environment variable.
+    """
+    import os
+    import openai
+    from copy import deepcopy
+
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    window_title = event.data["title"]
+    working = deepcopy(categories)
+
+    # The preamble's own examples create/modify categories, apply them (silently) so that
+    # the response can refer to them.
+    preamble = prompt_preamble(working)
+    working = process_prompt(preamble, working, quiet=True)
+
+    question = f'{preamble}\n\nWhat category should "{window_title}" be in?'
+
+    completion = openai.Completion.create(
+        model="text-davinci-002",
+        prompt=question,
+        temperature=0,
+        max_tokens=64,
+        top_p=1.0,
+        frequency_penalty=0.0,
+        presence_penalty=0.0,
+    )
+    answer = completion["choices"][0]["text"]
+    print("-" * 80)
+
+    return parse_gpt_response(answer, working, window_title)
+
+
+def check_is_category(text: str, category: Category) -> bool:
+    """
+    Checks if a title is put in the given category, when that category is the only rule.
+
+    A throwaway event with `text` as its title is categorized, and the resulting
+    "$category" is compared with the category's name.
+    """
+    name, rule = category
+    probe = Event(timestamp=datetime.now(tz=timezone.utc), data={"title": text})
+    classified = categorize([probe], [(list(name), Rule(rule))])
+    return classified[0].data["$category"] == list(name)
+
+
+def _labelled_value(line: str, label: str = ""):
+    """Returns the stripped text after the first ':' of a line, or None if the line does not start with `label`, has no ':' or has nothing after it."""
+    line = line.strip()
+    if not line.startswith(label) or ":" not in line:
+        return None
+    return line.split(":", 1)[1].strip() or None
+
+
+def _has_alternative(rule_regex: str, regex: str) -> bool:
+    """Checks if `regex` equals the rule's regex or one of its '|'-separated parts (a plain split, mirroring how regexes get appended)."""
+    return regex == rule_regex or regex in rule_regex.split("|")
+
+
+def parse_gpt_response(text: str, categories: List[Category], title=None, quiet=False):
+    """
+    Applies a single response to the list of categories.
+
+    The first line of the response decides what happens:
+     - "Category: A > B": an existing category was chosen, nothing is changed.
+     - "New Category: A > B", followed by "Regex: ...": a new regex category is appended,
+       unless `title` is given and does not end up in that category.
+     - "Modify Category: A > B", followed by "Append Regex: ...": the regex is appended
+       (with "|") to the first category with that name.
+     - "Skip: ...": nothing is changed.
+
+    The text is model output, so truncated or malformed responses are reported and ignored
+    instead of raising. Regexes which a category already has are not added a second time,
+    which keeps the list stable when the same examples are processed repeatedly.
+
+    `categories` is modified in place, and returned.
+    Unless `quiet`, the applied change is printed; problems are always printed.
+    """
+    line1, *lines = text.strip().split("\n")
+    line2 = lines[0] if lines else ""
+
+    if line1.startswith("Category:"):
+        # chose existing category
+        cat_name = tuple(line1.split(":", 1)[1].strip().split(" > "))
+        if not quiet:
+            print(f"Chose existing category {cat_name}  (title: {title})")
+        # parents of existing categories count as existing
+        if cat_name not in {tuple(name[: len(cat_name)]) for name, _ in categories}:
+            print(f"No category named {cat_name} found, skipping")
+    elif line1.startswith("New Category:"):
+        category = line1.split(":", 1)[1].strip().split(" > ")
+        regex = _labelled_value(line2)
+        if regex is None:
+            print(f"Unknown response: '{text.strip()}'")
+        elif any(
+            list(name) == category and _has_alternative(rule.get("regex", ""), regex)
+            for name, rule in categories
+        ):
+            if not quiet:
+                print(f"Category {category} already has regex {regex}  (title: {title})")
+        else:
+            cat: Category = (
+                category,
+                {"type": "regex", "regex": regex, "ignore_case": True},
+            )
+            if title and not check_is_category(title, cat):
+                print(
+                    f"Bad suggested regex '{cat[1]['regex']}'. Title '{title}' does not match category {category}."
+                )
+            else:
+                categories.append(cat)
+                if not quiet:
+                    print(
+                        f"Added category {category} with regex {regex}  (title: {title})"
+                    )
+    elif line1.startswith("Modify Category:"):
+        category = line1.split(":", 1)[1].strip().split(" > ")
+        regex = _labelled_value(line2, "Append Regex:")
+        if regex is None:
+            print(f"Unknown response: '{text.strip()}'")
+        else:
+            # get existing category, names may be lists or tuples
+            for name, rule in categories:
+                if list(name) != category:
+                    continue
+                existing = rule.get("regex", "")
+                if not _has_alternative(existing, regex):
+                    rule["regex"] = f"{existing}|{regex}" if existing else regex
+                    if not quiet:
+                        print(f"Appended {regex} to {category}")
+                break
+            else:
+                print(f"No category named {category} found, skipping")
+    elif line1.startswith("Skip:"):
+        pass
+    else:
+        print(f"Unknown response: '{text.strip()}'")
+
+    return categories
+
+
+def main():
+    """
+    Asks for category suggestions for the 100 longest uncategorized events.
+
+    Suggestions accumulate: events which are already covered by a previously
+    suggested rule are not sent again.
+    """
+    categories = example_categories()
+    uncategorized = get_events(categories)
+
+    longest_first = sorted(uncategorized, key=lambda e: e.duration, reverse=True)
+    for candidate in longest_first[:100]:
+        # re-categorize with the rules collected so far
+        rules = [(list(name), Rule(rule)) for name, rule in categories]
+        recategorized, *_ = categorize([candidate], rules)
+        if list(recategorized.data["$category"]) == ["Uncategorized"]:
+            categories = gpt_suggest(recategorized, categories)
+
+
+def test_parse_gpt_response():
+    """Processing the preamble examples gives exactly one Twitter category, with Tweetdeck appended to its regex."""
+    base = example_categories()
+    categories = process_prompt(prompt_preamble(base), base)
+
+    twitter_rules = [
+        rule
+        for name, rule in categories
+        if tuple(name) == ("Media", "Social", "Twitter")
+    ]
+    assert len(twitter_rules) == 1
+    assert twitter_rules[0]["regex"] == "Twitter|Tweetdeck"
+
+
+if __name__ == "__main__":
+    main()