feat: organize parsed content (#21)

RanKKI · xuweidong · web-flow · commit cb1db53eec0c · 2025-04-19T14:38:39.000+10:00
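* word md 分类目录,word 中文文件名 (organize Word and Markdown output into per-category directories; name cached Word files by their Chinese title)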

* refactor: fix typo and logic

---------

Co-authored-by: xuweidong <determined_xw@126.com>
diff --git a/scripts/manager/cache.py b/scripts/manager/cache.py
@@ -60,13 +60,20 @@ def OUTPUT_PATH(self):
 
     def write_law(self, path: Path, data: List[str]):
         full_path = self.OUTPUT_PATH / path
-        folder_path = full_path.parents[0]
+        folder_path = full_path.parent
         if not folder_path.exists():
-            folder_path.mkdir()
+            folder_path.mkdir(parents=True)
         with open(full_path, "w") as f:
             result = "\n\n".join(data)
             result = result.replace("<!-- TABLE -->\n", "<!-- TABLE -->")
             result = result.replace("\n<!-- TABLE END -->", "<!-- TABLE END -->")
             result = result.replace("|\n\n|", "|\n|")
             result = re.sub("\n{2,}", "\n\n", result)
             f.write(result)
+
+
+    def word_output_path(self, key: str, type: CacheType, path: Path | str):
+        p = self.base_path / type.value / path
+        if not p.exists():
+            p.mkdir(parents=True)
+        return p / key
diff --git a/scripts/manager/request.py b/scripts/manager/request.py
@@ -1,15 +1,16 @@
 import json
+import logging
 import os
 import re
 import urllib.request
 from hashlib import sha1
+from pathlib import Path
 from time import sleep
-import logging
+
 import requests
 from docx import Document
 from manager.cache import CacheManager, CacheType
 
-
 logger = logging.getLogger(__name__)
 
 REQUEST_HEADER = {
@@ -93,11 +94,14 @@ def get_html(self, url) -> str:
         self.cache.set(cache_key, CacheType.HTMLDocument, ret, "html")
         return ret
 
-    def get_word(self, url) -> Document:
+    def get_word(self, url: str, title_or_output_path: Path) -> Document:
         filename = os.path.basename(url)
-        cache_key = filename.split(".")[0]
+        _, file_extension = os.path.splitext(filename)
+
+        title = title_or_output_path.name
+        parent_path = title_or_output_path.parent
 
-        ok, path = self.cache.is_exists(cache_key, CacheType.WordDocument, "docx")
+        ok, path = self.cache.is_exists(title, CacheType.WordDocument, file_extension)
         if not ok:
             if not re.match(".*docx$", filename):
                 return None
diff --git a/scripts/parsers/word.py b/scripts/parsers/word.py
@@ -1,16 +1,17 @@
 import logging
 import re
+from pathlib import Path
 from typing import List, Tuple
 
-from docx.document import Document as _Document
+from common import LINE_RE
 from docx import Document
+from docx.document import Document as _Document
+from docx.oxml import CT_SectPr
 from docx.oxml.table import CT_Tbl
 from docx.oxml.text.paragraph import CT_P
-from docx.oxml import CT_SectPr
 from docx.table import Table, _Cell, _Row
 from docx.text.paragraph import Paragraph
 from parsers.base import Parser
-from common import LINE_RE
 
 logger = logging.getLogger(__name__)
 
@@ -49,12 +50,14 @@ def iter_block_items(self, parent):
                 yield Table(child, parent)
 
     def parse(self, result, detail) -> Tuple[str, str, List[str]]:
-        document = self.request.get_word(detail["path"])
+        level = result["level"].strip()
+        title = result["title"].strip()
+
+        document = self.request.get_word(detail["path"], Path(level) / title)
         if not document:
             logger.warning(f"document {detail['path']} not exists")
             return
 
-        title = result["title"].strip()
         return self.parse_document(document, title)
 
     def parse_document(self, document, title):
diff --git a/scripts/request.py b/scripts/request.py
@@ -78,6 +78,8 @@ def parse_law(self, item):
         detail = self.request.get_law_detail(item["id"])
         result = detail["result"]
         title = result["title"]
+        # 类别
+        level = result["level"]
         files = self.__reorder_files(result["body"])
         logger.debug(f"parsing {title}")
         if len(files) == 0:
@@ -96,7 +98,7 @@ def parse_law(self, item):
             if not filedata:
                 continue
 
-            output_path = self.__get_law_output_path(title, item["publish"])
+            output_path = level / self.__get_law_output_path(title, item["publish"])
             logger.debug(f"parsing {title} success")
             self.cache.write_law(output_path, filedata)