|
4 | 4 | import json |
5 | 5 | import logging |
6 | 6 | import re |
7 | | -import shutil |
8 | 7 | import tempfile |
9 | 8 | import threading |
10 | 9 | import time |
|
86 | 85 | captcha_tmp_dir.mkdir(parents=True, exist_ok=True) |
87 | 86 | temp_files_dir.mkdir(parents=True, exist_ok=True) |
88 | 87 |
89 | | -S3_BUCKET = "indian-supreme-court-judgments-test" |
| 88 | +S3_BUCKET = "indian-supreme-court-judgments" |
90 | 89 | S3_PREFIX = "" |
91 | 90 | LOCAL_DIR = Path("./local_sc_judgments_data") |
92 | 91 | PACKAGES_DIR = Path("./packages") |
93 | 92 | IST = timezone(timedelta(hours=5, minutes=30)) |
94 | 93 |
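
Note: the bucket constant now points at the production dataset rather than the test bucket. A quick way to sanity-check the new constants is to list a few keys anonymously; this is only a sketch and assumes the bucket permits unsigned reads, which is consistent with the UNSIGNED client used elsewhere in this script.

```python
# Sketch only: assumes anonymous (unsigned) reads are allowed on the bucket.
import boto3
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
resp = s3.list_objects_v2(Bucket="indian-supreme-court-judgments", MaxKeys=5)
for obj in resp.get("Contents", []):
    print(obj["Key"], obj["Size"])
```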
95 | 94 |
96 | 95 | def get_json_file(file_path) -> dict: |
97 | | - with open(file_path, "r") as f: |
98 | | - return json.load(f) |
| 96 | + try: |
| 97 | + with open(file_path, "r") as f: |
| 98 | + content = f.read().strip() |
| 99 | + if not content: |
| 100 | + return {} # Return empty dict for empty file |
| 101 | + return json.loads(content) |
| 102 | + except FileNotFoundError: |
| 103 | + return {} # Return empty dict if file doesn't exist |
| 104 | + except json.JSONDecodeError: |
| 105 | + logging.warning(f"Invalid JSON in {file_path}, returning empty dict") |
| 106 | + return {} |
99 | 107 |
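
Note: with the hardened loader, a missing, empty, or corrupt JSON file all degrade to an empty dict instead of raising, which presumably lets callers such as get_tracking_data() below use .get() without special-casing. A minimal usage sketch (file paths and the key name are hypothetical):

```python
# Hypothetical paths; only the behaviour of get_json_file() is being illustrated.
tracking = get_json_file("./track_data/tracking.json")
# -> {} if the file is missing, empty, or contains invalid JSON (a warning is
#    logged for the invalid-JSON case); otherwise the parsed dict.
last_date = tracking.get("last_processed_date")  # hypothetical key; .get() is safe on {}
```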
100 | 108 |
101 | 109 | def get_tracking_data(): |
@@ -960,6 +968,7 @@ def download_pdf(self, pdf_info, lang_code): |
960 | 968 | verify=False, |
961 | 969 | headers=self.get_headers(), |
962 | 970 | timeout=30, |
| 971 | + allow_redirects=True, |
963 | 972 | ) |
964 | 973 |
965 | 974 | # Validate response |
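
Note: the added allow_redirects=True makes the PDF request follow redirects before the response is validated. requests already defaults to following redirects for GET, so the argument mainly makes the intent explicit. A standalone sketch (the URL is a placeholder, and the PDF check is one possible validation, not necessarily the one this script performs):

```python
import requests

resp = requests.get(
    "https://example.org/some-judgment.pdf",  # placeholder URL
    timeout=30,
    allow_redirects=True,   # follow 3xx hops to the final file
)
if resp.history:            # non-empty when at least one redirect happened
    print("redirected via:", [r.status_code for r in resp.history])
resp.raise_for_status()
looks_like_pdf = resp.content.startswith(b"%PDF-")  # cheap sanity check before saving
```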
@@ -1246,16 +1255,16 @@ def get_latest_date_from_metadata(force_check_files=False): |
1246 | 1255 | s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) |
1247 | 1256 | current_year = datetime.now().year |
1248 | 1257 |
1249 | | - # Updated path for metadata index |
1250 | | - index_path = LOCAL_DIR / str(current_year) / "metadata.index.json" |
| 1258 | + # Create a separate directory for index files outside of LOCAL_DIR |
| 1259 | + index_cache_dir = Path("./index_cache") |
| 1260 | + index_cache_dir.mkdir(parents=True, exist_ok=True) |
| 1261 | + |
| 1262 | + # Updated path for metadata index - store outside LOCAL_DIR |
| 1263 | + index_path = index_cache_dir / f"{current_year}_metadata.index.json" |
1251 | 1264 | index_key = f"metadata/zip/year={current_year}/metadata.index.json" |
1252 | 1265 |
1253 | 1266 | if not force_check_files: |
1254 | 1267 | try: |
1255 | | - # Ensure year directory exists |
1256 | | - year_dir = LOCAL_DIR / str(current_year) |
1257 | | - year_dir.mkdir(parents=True, exist_ok=True) |
1258 | | - |
1259 | 1268 | # Try to get current year index |
1260 | 1269 | s3.download_file(S3_BUCKET, index_key, str(index_path)) |
1261 | 1270 | with open(index_path, "r") as f: |
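
Note: my reading of this change is that the later post-download step decides whether anything new arrived by looking for digit-named year directories under LOCAL_DIR, so caching metadata.index.json under LOCAL_DIR/&lt;year&gt;/ would make every run look like it downloaded data. Moving the cache to ./index_cache avoids that false positive. A minimal sketch of the year-directory scan being protected, mirroring the glob used further down (the helper name is hypothetical):

```python
from pathlib import Path

def detect_downloaded_years(local_dir: Path) -> set[str]:
    """Hypothetical helper: only digit-named directories (e.g. "2024") count as data."""
    return {p.name for p in local_dir.glob("*") if p.is_dir() and p.name.isdigit()}
```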
@@ -1479,29 +1488,28 @@ def generate_parquet_from_local_metadata(local_dir, s3_bucket): |
1479 | 1488 | ) |
1480 | 1489 |
1481 | 1490 | # AFTER the with block completes (archives are now uploaded to S3) |
1482 | | - # Determine which years were just downloaded |
1483 | | - downloaded_years = set() |
1484 | | - for year_dir in LOCAL_DIR.glob("*"): |
1485 | | - if year_dir.is_dir() and year_dir.name.isdigit(): |
1486 | | - downloaded_years.add(year_dir.name) |
1487 | | - |
1488 | | - if downloaded_years: |
1489 | | - logger.info(f"Found new data for years: {sorted(downloaded_years)}") |
1490 | | - # Process metadata AFTER archives are uploaded to S3 |
1491 | | - logger.info( |
1492 | | - f"Processing metadata files for years: {sorted(downloaded_years)}..." |
1493 | | - ) |
| 1491 | + # Check if any new files were actually downloaded in this run |
| 1492 | + if latest_date.date() < today: |
| 1493 | + # Only process if we actually ran the downloader and found new data |
| 1494 | + logger.info("Checking for newly downloaded data...") |
| 1495 | + downloaded_years = set() |
| 1496 | + for year_dir in LOCAL_DIR.glob("*"): |
| 1497 | + if year_dir.is_dir() and year_dir.name.isdigit(): |
| 1498 | + downloaded_years.add(year_dir.name) |
| 1499 | + |
| 1500 | + if downloaded_years: |
| 1501 | + logger.info(f"Found new data for years: {sorted(downloaded_years)}") |
| 1502 | + # Process metadata AFTER archives are uploaded to S3 |
| 1503 | + logger.info( |
| 1504 | + f"Processing metadata files for years: {sorted(downloaded_years)}..." |
| 1505 | + ) |
1494 | 1506 |
1495 | | - # Now use the standard S3 processor since files are already uploaded |
1496 | | - generate_parquet_from_metadata(S3_BUCKET, downloaded_years) |
| 1507 | + # Now use the standard S3 processor since files are already uploaded |
| 1508 | + generate_parquet_from_metadata(S3_BUCKET, downloaded_years) |
| 1509 | + else: |
| 1510 | + logger.info("No new files downloaded. Skipping parquet conversion.") |
1497 | 1511 | else: |
1498 | | - logger.info("No new years to process for parquet conversion.") |
1499 | | - |
1500 | | - # Clean up LOCAL_DIR after processing |
1501 | | - if LOCAL_DIR.exists(): |
1502 | | - logger.info(f"Cleaning up local data directory {LOCAL_DIR}...") |
1503 | | - shutil.rmtree(LOCAL_DIR, ignore_errors=True) |
1504 | | - logger.info("✅ Local data directory deleted") |
| 1512 | + logger.info("Data is already up to date. Skipping parquet conversion.") |
1505 | 1513 | else: |
1506 | 1514 | run( |
1507 | 1515 | args.start_date, |
|
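Note: taken together, the last hunk gates parquet conversion on whether the downloader actually ran in this invocation (latest_date.date() < today) and only then scans LOCAL_DIR for year directories. A condensed, hedged sketch of the resulting flow, reusing the hypothetical helper sketched above; generate_parquet_from_metadata and S3_BUCKET come from the script itself:

```python
import logging
from datetime import date, datetime
from pathlib import Path

logger = logging.getLogger(__name__)

def maybe_convert_to_parquet(latest_date: datetime, today: date, local_dir: Path) -> None:
    """Sketch of the new gating, not the script's actual function."""
    if latest_date.date() < today:
        # The downloader ran in this invocation, so see what it left behind.
        downloaded_years = detect_downloaded_years(local_dir)  # hypothetical helper above
        if downloaded_years:
            generate_parquet_from_metadata(S3_BUCKET, downloaded_years)  # from the script
        else:
            logger.info("No new files downloaded. Skipping parquet conversion.")
    else:
        logger.info("Data is already up to date. Skipping parquet conversion.")
```

A side effect worth noting: the shutil.rmtree(LOCAL_DIR) cleanup (and the `import shutil` removed at the top of the file) is gone, so the local data directory now persists between runs.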