
Commit 34f644b

Merge pull request #12 from rnihesh/download-py
corrected download.py
2 parents 482c410 + e60b053 · commit 34f644b

File tree

3 files changed: +43 -33 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion

@@ -206,4 +206,5 @@ cython_debug/
 
 
 # Temporary Directory for S3 Updater
-local_sc_judgments_data/
+local_sc_judgments_data/
+index_cache/

download.py

Lines changed: 39 additions & 31 deletions

@@ -4,7 +4,6 @@
 import json
 import logging
 import re
-import shutil
 import tempfile
 import threading
 import time
@@ -86,16 +85,25 @@
 captcha_tmp_dir.mkdir(parents=True, exist_ok=True)
 temp_files_dir.mkdir(parents=True, exist_ok=True)
 
-S3_BUCKET = "indian-supreme-court-judgments-test"
+S3_BUCKET = "indian-supreme-court-judgments"
 S3_PREFIX = ""
 LOCAL_DIR = Path("./local_sc_judgments_data")
 PACKAGES_DIR = Path("./packages")
 IST = timezone(timedelta(hours=5, minutes=30))
 
 
 def get_json_file(file_path) -> dict:
-    with open(file_path, "r") as f:
-        return json.load(f)
+    try:
+        with open(file_path, "r") as f:
+            content = f.read().strip()
+            if not content:
+                return {}  # Return empty dict for empty file
+            return json.loads(content)
+    except FileNotFoundError:
+        return {}  # Return empty dict if file doesn't exist
+    except json.JSONDecodeError:
+        logging.warning(f"Invalid JSON in {file_path}, returning empty dict")
+        return {}
 
 
 def get_tracking_data():
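
The hardened loader above now degrades to an empty dict instead of raising. A minimal usage sketch, assuming download.py can be imported from the working directory; the file names below are hypothetical:

# Usage sketch for the new get_json_file behaviour (hypothetical file names).
from pathlib import Path

from download import get_json_file

Path("empty.json").write_text("")               # empty file
Path("bad.json").write_text("{not valid json")  # malformed JSON

print(get_json_file("missing.json"))  # {} instead of raising FileNotFoundError
print(get_json_file("empty.json"))    # {} for an empty file
print(get_json_file("bad.json"))      # {} plus a logged warning about invalid JSON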
@@ -960,6 +968,7 @@ def download_pdf(self, pdf_info, lang_code):
                 verify=False,
                 headers=self.get_headers(),
                 timeout=30,
+                allow_redirects=True,
             )
 
             # Validate response
@@ -1246,16 +1255,16 @@ def get_latest_date_from_metadata(force_check_files=False):
     s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
     current_year = datetime.now().year
 
-    # Updated path for metadata index
-    index_path = LOCAL_DIR / str(current_year) / "metadata.index.json"
+    # Create a separate directory for index files outside of LOCAL_DIR
+    index_cache_dir = Path("./index_cache")
+    index_cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Updated path for metadata index - store outside LOCAL_DIR
+    index_path = index_cache_dir / f"{current_year}_metadata.index.json"
     index_key = f"metadata/zip/year={current_year}/metadata.index.json"
 
     if not force_check_files:
         try:
-            # Ensure year directory exists
-            year_dir = LOCAL_DIR / str(current_year)
-            year_dir.mkdir(parents=True, exist_ok=True)
-
             # Try to get current year index
             s3.download_file(S3_BUCKET, index_key, str(index_path))
             with open(index_path, "r") as f:
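
For reference, the unsigned-client download above as a self-contained sketch; the bucket name and index key are copied from the diff, and it assumes the per-year index object is publicly readable:

# Stand-alone sketch of the anonymous (unsigned) S3 index download used above.
from datetime import datetime
from pathlib import Path

import boto3
from botocore import UNSIGNED
from botocore.config import Config

year = datetime.now().year
cache_dir = Path("./index_cache")
cache_dir.mkdir(parents=True, exist_ok=True)

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
s3.download_file(
    "indian-supreme-court-judgments",                 # S3_BUCKET from the diff
    f"metadata/zip/year={year}/metadata.index.json",  # index_key from the diff
    str(cache_dir / f"{year}_metadata.index.json"),   # cached local copy
)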
@@ -1479,29 +1488,28 @@ def generate_parquet_from_local_metadata(local_dir, s3_bucket):
             )
 
         # AFTER the with block completes (archives are now uploaded to S3)
-        # Determine which years were just downloaded
-        downloaded_years = set()
-        for year_dir in LOCAL_DIR.glob("*"):
-            if year_dir.is_dir() and year_dir.name.isdigit():
-                downloaded_years.add(year_dir.name)
-
-        if downloaded_years:
-            logger.info(f"Found new data for years: {sorted(downloaded_years)}")
-            # Process metadata AFTER archives are uploaded to S3
-            logger.info(
-                f"Processing metadata files for years: {sorted(downloaded_years)}..."
-            )
+        # Check if any new files were actually downloaded in this run
+        if latest_date.date() < today:
+            # Only process if we actually ran the downloader and found new data
+            logger.info("Checking for newly downloaded data...")
+            downloaded_years = set()
+            for year_dir in LOCAL_DIR.glob("*"):
+                if year_dir.is_dir() and year_dir.name.isdigit():
+                    downloaded_years.add(year_dir.name)
+
+            if downloaded_years:
+                logger.info(f"Found new data for years: {sorted(downloaded_years)}")
+                # Process metadata AFTER archives are uploaded to S3
+                logger.info(
+                    f"Processing metadata files for years: {sorted(downloaded_years)}..."
+                )
 
-            # Now use the standard S3 processor since files are already uploaded
-            generate_parquet_from_metadata(S3_BUCKET, downloaded_years)
+                # Now use the standard S3 processor since files are already uploaded
+                generate_parquet_from_metadata(S3_BUCKET, downloaded_years)
+            else:
+                logger.info("No new files downloaded. Skipping parquet conversion.")
         else:
-            logger.info("No new years to process for parquet conversion.")
-
-        # Clean up LOCAL_DIR after processing
-        if LOCAL_DIR.exists():
-            logger.info(f"Cleaning up local data directory {LOCAL_DIR}...")
-            shutil.rmtree(LOCAL_DIR, ignore_errors=True)
-            logger.info("✅ Local data directory deleted")
+            logger.info("Data is already up to date. Skipping parquet conversion.")
     else:
         run(
             args.start_date,
requirements.txt

Lines changed: 2 additions & 1 deletion

@@ -10,4 +10,5 @@ urllib3
 requests
 torch
 onnx
-onnxruntime
+onnxruntime
+pyarrow>=21.0.0
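
pyarrow is the new dependency backing the parquet conversion called from download.py. A minimal sketch of what it is used for; the column names, values, and output file are illustrative only, not taken from the repository:

# Build a small metadata table and write it to Parquet with pyarrow.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table(
    {
        "year": [2025, 2025],
        "diary_no": ["12345/2025", "67890/2025"],  # hypothetical columns
        "title": ["Judgment A", "Judgment B"],
    }
)
pq.write_table(table, "metadata_sample.parquet")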
