14  | 14  | from langchain_core.callbacks import BaseCallbackHandler
15  | 15  | from langchain_core.messages.base import BaseMessage
16  | 16  | from langchain_core.outputs.llm_result import LLMResult
17  |     | -from langchain_openai import ChatOpenAI
18  | 17  | from loguru import logger
19  | 18  |
20  | 19  | from .env import env

@@ -46,7 +45,7 @@ def load_llm(
46  | 45  |     private: bool,
47  | 46  |     tags: List[str],
48  | 47  |     **extra_model_args,
49  |     | -) -> Union[ChatLiteLLM, ChatOpenAI, FakeListChatModel]:
    | 48  | +) -> Union[ChatLiteLLM, FakeListChatModel]:
50  | 49  |     """load language model"""
51  | 50  |     if extra_model_args is None:
52  | 51  |         extra_model_args = {}

@@ -145,50 +144,35 @@ def load_llm(
145 | 144 |
146 | 145 |     assert private == env.WDOC_PRIVATE_MODE
147 | 146 |
148 |     | -    if (not private) and (modelname.backend == "openai") and (api_base is None):
149 |     | -        max_tokens = get_model_max_tokens(modelname)
150 |     | -        logger.debug(f"Detected max token for model {modelname.original}: {max_tokens}")
151 |     | -        if "max_tokens" not in extra_model_args:
    | 147 | +    max_tokens = get_model_max_tokens(modelname)
    | 148 | +    logger.debug(f"Detected max token for model {modelname.original}: {max_tokens}")
    | 149 | +    if "max_tokens" not in extra_model_args:
    | 150 | +        # intentionaly limiting max tokens because it can cause bugs
    | 151 | +        if modelname.backend != "ollama":
152 | 152 |             extra_model_args["max_tokens"] = int(max_tokens * 0.9)
153 |     | -        logger.debug(f"Using ChatOpenAI backend for model {modelname.original}")
154 |     | -        llm = ChatOpenAI(
155 |     | -            model_name=modelname.model,
156 |     | -            cache=llm_cache,
157 |     | -            disable_streaming=True,  # Not needed and might break cache
158 |     | -            verbose=llm_verbosity,
159 |     | -            callbacks=[PriceCountingCallback(verbose=llm_verbosity)]
160 |     | -            + langfuse_callback_holder,  # use langchain's callback to langfuse
161 |     | -            **extra_model_args,
162 |     | -        )
163 |     | -    else:
164 |     | -        max_tokens = get_model_max_tokens(modelname)
165 |     | -        logger.debug(f"Detected max token for model {modelname.original}: {max_tokens}")
166 |     | -        if "max_tokens" not in extra_model_args:
167 |     | -            # intentionaly limiting max tokens because it can cause bugs
168 |     | -            if modelname.backend != "ollama":
    | 153 | +        else:
    | 154 | +            if max_tokens <= 10_000:
169 | 155 |                 extra_model_args["max_tokens"] = int(max_tokens * 0.9)
170 | 156 |             else:
171 |     | -                if max_tokens <= 10_000:
172 |     | -                    extra_model_args["max_tokens"] = int(max_tokens * 0.9)
173 |     | -                else:
174 |     | -                    logger.debug(
175 |     | -                        f"Detected an ollama model with large max_tokens ({max_tokens}), they usually overestimate their context window capabilities so we reduce it if the user does not specify a max_tokens kwarg"
176 |     | -                    )
177 |     | -                    extra_model_args["max_tokens"] = int(max(max_tokens * 0.2, 4096))
178 |     | -        logger.debug(f"Using ChatLiteLLM backend for model {modelname.original}")
179 |     | -        llm = ChatLiteLLM(
180 |     | -            model_name=modelname.original,
181 |     | -            disable_streaming=True,  # Not needed and might break cache
182 |     | -            api_base=api_base,
183 |     | -            cache=llm_cache,
184 |     | -            verbose=llm_verbosity,
185 |     | -            tags=tags,
186 |     | -            callbacks=[PriceCountingCallback(verbose=llm_verbosity)]
187 |     | -            + langfuse_callback_holder,
188 |     | -            user=env.WDOC_LITELLM_USER,
189 |     | -            **extra_model_args,
190 |     | -        )
191 |     | -        litellm.drop_params = True
    | 157 | +                logger.debug(
    | 158 | +                    f"Detected an ollama model with large max_tokens ({max_tokens}), they usually overestimate their context window capabilities so we reduce it if the user does not specify a max_tokens kwarg"
    | 159 | +                )
    | 160 | +                extra_model_args["max_tokens"] = int(max(max_tokens * 0.2, 4096))
    | 161 | +    logger.debug(f"Using ChatLiteLLM backend for model {modelname.original}")
    | 162 | +    llm = ChatLiteLLM(
    | 163 | +        model_name=modelname.original,
    | 164 | +        disable_streaming=True,  # Not needed and might break cache
    | 165 | +        api_base=api_base,
    | 166 | +        cache=llm_cache,
    | 167 | +        verbose=llm_verbosity,
    | 168 | +        tags=tags,
    | 169 | +        callbacks=[PriceCountingCallback(verbose=llm_verbosity)]
    | 170 | +        + langfuse_callback_holder,
    | 171 | +        user=env.WDOC_LITELLM_USER,
    | 172 | +        **extra_model_args,
    | 173 | +    )
    | 174 | +    litellm.drop_params = True
    | 175 | +
192 | 176 |     if private:
193 | 177 |         assert llm.api_base, "private is set but no api_base for llm were found"
194 | 178 |         assert (
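With this commit the dedicated ChatOpenAI branch is gone: judging by the new return annotation, every real model (anything other than FakeListChatModel) is now built through ChatLiteLLM, and the max_tokens guard that previously lived only in the non-OpenAI branch now applies to every backend. Below is a minimal sketch of that capping rule, pulled out of load_llm for illustration; the function name and signature are hypothetical, not part of wdoc.

```python
# Hedged sketch of the capping logic in the patch above (names are illustrative).
def cap_max_tokens(backend: str, max_tokens: int, extra_model_args: dict) -> dict:
    """Limit max_tokens unless the caller already set it explicitly."""
    if "max_tokens" in extra_model_args:
        return extra_model_args  # an explicit user value always wins
    if backend != "ollama":
        # keep ~10% headroom below the detected context window
        extra_model_args["max_tokens"] = int(max_tokens * 0.9)
    elif max_tokens <= 10_000:
        extra_model_args["max_tokens"] = int(max_tokens * 0.9)
    else:
        # ollama models often overstate their context window, so cap much harder
        extra_model_args["max_tokens"] = int(max(max_tokens * 0.2, 4096))
    return extra_model_args


if __name__ == "__main__":
    print(cap_max_tokens("ollama", 128_000, {}))  # {'max_tokens': 25600}
    print(cap_max_tokens("openai", 128_000, {}))  # {'max_tokens': 115200}
```

Keeping `litellm.drop_params = True` after construction is meant to let LiteLLM drop kwargs that a given provider does not support (such as max_tokens on an unusual backend) rather than raise an error.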