10 changes: 10 additions & 0 deletions openlibrary/solr/data_provider.py
@@ -550,6 +550,16 @@ def preload_editions_of_works(self, work_keys: Iterable[str]):

     def clear_cache(self):
         super().clear_cache()
+        # Log cache statistics to monitor effectiveness and validate the fix
+        if self.cache or self.ia_cache or self.edition_keys_of_works_cache or self.redirect_cache:
+            logger.debug(
+                "Cache stats before clearing - documents: %d, IA metadata: %d, "
+                "work→editions: %d, redirects: %d",
+                len(self.cache),
+                len(self.ia_cache),
+                len(self.edition_keys_of_works_cache),
+                len(self.redirect_cache),
+            )
         self.cache.clear()
         self.redirect_cache.clear()
         self.edition_keys_of_works_cache.clear()
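
For reference, a minimal runnable sketch of the logging pattern above. CachingProviderSketch and its seed data are hypothetical stand-ins, not the real data provider; note that ia_cache is not cleared in the diff itself (presumably super().clear_cache() handles it), so the standalone sketch clears it inline.

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("data_provider_sketch")


class CachingProviderSketch:
    """Hypothetical stand-in for the data provider's caching layer."""

    def __init__(self) -> None:
        self.cache = {"/works/OL1W": {"type": "/type/work"}}
        self.ia_cache: dict[str, dict] = {}
        self.edition_keys_of_works_cache = {"/works/OL1W": ["/books/OL1M"]}
        self.redirect_cache: dict[str, str] = {}

    def clear_cache(self) -> None:
        # Log sizes before clearing so cache effectiveness can be monitored.
        if self.cache or self.ia_cache or self.edition_keys_of_works_cache or self.redirect_cache:
            logger.debug(
                "Cache stats before clearing - documents: %d, IA metadata: %d, "
                "work→editions: %d, redirects: %d",
                len(self.cache),
                len(self.ia_cache),
                len(self.edition_keys_of_works_cache),
                len(self.redirect_cache),
            )
        self.cache.clear()
        self.ia_cache.clear()  # assumed to be cleared by super().clear_cache() in the real provider
        self.redirect_cache.clear()
        self.edition_keys_of_works_cache.clear()


CachingProviderSketch().clear_cache()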
8 changes: 6 additions & 2 deletions scripts/solr_updater/solr_updater.py
@@ -230,8 +230,8 @@ async def update_keys(keys):
         count += len(chunk)
         await update.do_updates(chunk)
 
-        # Caches should not persist between different calls to update_keys!
-        update.data_provider.clear_cache()
+        # Cache is now cleared after each iteration in main(), not after each
+        # batch; this allows cache reuse within a single call to update_keys().
 
     if count:
         logger.info("updated %d documents", count)
@@ -300,6 +300,10 @@ async def main(
         keys = parse_log(records, load_ia_scans)
         count = await update_keys(keys)
 
+        # Clear the cache after processing all keys from this iteration: this
+        # prevents unbounded cache growth while allowing reuse within update_keys().
+        update.data_provider.clear_cache()
+
         if logfile.tell() != offset:
             offset = logfile.tell()
             logger.info("saving offset %s", offset)
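
To make the new cache lifetime concrete, here is a self-contained sketch under stated assumptions: ProviderSketch, main_iteration, and the chunking scheme are hypothetical, not the actual solr_updater API. The cache survives every chunk processed inside update_keys() and is cleared exactly once per main()-loop iteration.

import asyncio


class ProviderSketch:
    """Hypothetical in-memory stand-in for update.data_provider."""

    def __init__(self) -> None:
        self.cache: dict[str, str] = {}

    def clear_cache(self) -> None:
        self.cache.clear()


provider = ProviderSketch()


async def update_keys(keys: list[str], batch_size: int = 2) -> int:
    count = 0
    for i in range(0, len(keys), batch_size):
        chunk = keys[i : i + batch_size]
        for key in chunk:
            # A repeated key hits the cache instead of triggering a re-fetch,
            # because the cache is no longer cleared between chunks.
            provider.cache.setdefault(key, f"doc for {key}")
        count += len(chunk)
    return count


async def main_iteration(keys: list[str]) -> None:
    count = await update_keys(keys)
    print(f"updated {count} documents; cache size {len(provider.cache)}")
    # Cleared once per iteration: bounds memory growth across iterations
    # while still allowing reuse within a single update_keys() call.
    provider.clear_cache()


asyncio.run(main_iteration(["/works/OL1W", "/works/OL2W", "/works/OL1W"]))

Running the sketch, the repeated key /works/OL1W in the second chunk is served from the cache rather than fetched again, which is exactly the reuse the moved clear_cache() call enables.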