
Commit 7c7b44b

[NeuralChat] Add readme, add content length filter, fix build error (intel#1378)
1 parent: 3775ffe

File tree: 5 files changed (+30, -25 lines)

intel_extension_for_transformers/langchain/vectorstores/chroma.py

Lines changed: 0 additions & 1 deletion
@@ -196,7 +196,6 @@ def build(
                 client_settings=client_settings,
                 client=client,
                 collection_metadata=collection_metadata,
-                **kwargs,
             )
             return chroma_collection
         else:

intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/README.md

Lines changed: 15 additions & 11 deletions
@@ -35,16 +35,16 @@ To ensure a smooth experience, we've made sure this plugin is compatible with co
 | xlsx | ['Questions', 'Answers']<br>['question', 'answer', 'link']<br>['context', 'link'] |
 | csv | ['question', 'correct_answer'] |
 | json/jsonl | {'content':xxx, 'link':xxx}|
-| txt | / |
-| html | / |
-| markdown | / |
-| word | / |
-| pdf | / |
+| txt | No format required |
+| html | No format required |
+| markdown | No format required |
+| word | No format required |
+| pdf | No format required |

 # Usage
-The most convenient way to use is this plugin is via our `build_chatbot` api as introduced in the [example code](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat/examples/plugins/retrieval). The user could refer to it for a simple test.
+Before using RAG in NeuralChat, please install the necessary dependencies in [requirements.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/requirements.txt) to avoid import errors. The most convenient way to use this plugin is via our `build_chatbot` API, as introduced in the [example code](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat/examples/plugins/retrieval). The user can refer to it for a simple test.

-We support multiple file formats for retrieval, including unstructured file formats such as pdf, docx, html, txt, and markdown, as well as structured file formats like jsonl and xlsx. For structured file formats, they must adhere to predefined structures.
+We support multiple file formats for retrieval, including unstructured file formats such as pdf, docx, html, txt, and markdown, as well as structured file formats like jsonl/json, csv, and xlsx. Structured file formats must adhere to the predefined structures above. We also support uploading the knowledge base via an HTTP web link.

 In the case of jsonl files, they should be formatted as dictionaries, such as: {'content':xxx, 'link':xxx}. The support for xlsx files is specifically designed for Question-Answer (QA) tasks. Users can input QA pairs for retrieval. Therefore, the table's header should include items labeled "Question" and "Answer". The reference files can be found [here](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat/assets/docs).
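The updated usage section maps directly onto a short script. A minimal sketch, assuming the `build_chatbot`/`PipelineConfig`/`plugins` entry points exercised by this commit's test file; the document path and question are placeholders:

```python
# Hedged sketch of enabling RAG via the retrieval plugin; mirrors the pattern in
# test_parameters.py from this commit. Path and model are placeholders.
from intel_extension_for_transformers.neural_chat import PipelineConfig, build_chatbot, plugins

plugins.retrieval.enable = True
plugins.retrieval.args["input_path"] = "./docs/sample.txt"  # any supported format
plugins.retrieval.args["persist_directory"] = "./rag_db"    # where the vectorstore is saved

config = PipelineConfig(model_name_or_path="facebook/opt-125m", plugins=plugins)
chatbot = build_chatbot(config)
print(chatbot.predict("What is this document about?"))
```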

@@ -83,11 +83,15 @@ Below are the descriptions for the available parameters in `agent_QA`,
 | embedding_model | str | The name or path of the text embedding model |-|
 | response_template | str | Default response when there are no relevant documents available for RAG |-|
 | mode | str | The RAG behavior for different use cases. Please check [here](#rag-mode) |"accuracy", "general"|
-| retrieval_type | str | The type of the retriever. Please check [here](#retrievers) for more details | "default", "child_parent"|
+| retrieval_type | str | The type of the retriever. Please check [here](#retrievers) for more details | "default", "child_parent", "bm25"|
 | process | bool | Whether to split long documents into small chunks. The size of each chunk is defined by `max_chuck_size` and `min_chuck_size`|True, False|
 | max_chuck_size | int | The max token length for a single chunk in the knowledge base |-|
 | min_chuck_size | int | The min token length for a single chunk in the knowledge base |-|
 | append | bool | Whether to append the new knowledge to the existing knowledge base or directly load the existing knowledge base |True, False|
+| polish | bool | Whether to polish the input query before processing |True, False|
+| enable_rerank | bool | Whether to enable the retrieve-then-rerank pipeline |True, False|
+| reranker_model | str | The name of a reranker model on the Hugging Face Hub, or a local path |-|
+| top_n | int | The number of documents returned by the reranker model |-|

 For more retriever- and vectorstore-related parameters, please check [here](#langchain-extension).
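The four new parameters slot into the same `plugins.retrieval.args` dictionary. An illustrative sketch; the reranker model name below is an example choice, not a project default:

```python
# Illustrative settings for the parameters added in this commit.
plugins.retrieval.args["polish"] = True          # rewrite the query before retrieval
plugins.retrieval.args["enable_rerank"] = True   # retrieve first, then rerank
plugins.retrieval.args["reranker_model"] = "BAAI/bge-reranker-base"  # example HF name or local path
plugins.retrieval.args["top_n"] = 3              # documents the reranker returns
```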

@@ -185,17 +189,17 @@ plugins.retrieval.args["search_kwargs"]=xxx
 ```

 If "search_type"="similarity":
->search_kwargs={"k"=xxx}
+>search_kwargs={"k":xxx}

 "k" is the number of the most similar documents to return.

 If "search_type"="mmr":
->search_kwargs={"k"=xxx, "fetch_k"=xxx, "lamabda_mult"=xxx}
+>search_kwargs={"k":xxx, "fetch_k":xxx, "lamabda_mult":xxx}

 "k" is the number of the most similar documents to return. "fetch_k" is the number of documents fetched and passed to the MMR algorithm. "lamabda_mult" is a number between 0 and 1 that determines the degree of diversity among the results, with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.

 If "search_type"="similarity_score_threshold":
->search_kwargs={"k"=xxx, "score_threshold"=xxx}
+>search_kwargs={"k":xxx, "score_threshold":xxx}

 "k" is the number of the most similar documents to return. "score_threshold" is the similarity score threshold for the retrieved documents.
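Putting the corrected dict syntax together, one configuration per search type. A sketch; note that LangChain's MMR keyword is spelled `lambda_mult`, so the README's `lamabda_mult` appears to carry over a typo:

```python
# Example search configurations using the corrected dict syntax from this commit.
plugins.retrieval.args["search_type"] = "similarity"
plugins.retrieval.args["search_kwargs"] = {"k": 4}

plugins.retrieval.args["search_type"] = "mmr"
plugins.retrieval.args["search_kwargs"] = {"k": 4, "fetch_k": 20, "lambda_mult": 0.5}

plugins.retrieval.args["search_type"] = "similarity_score_threshold"
plugins.retrieval.args["search_kwargs"] = {"k": 4, "score_threshold": 0.7}
```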

intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/retrieval_agent.py

Lines changed: 10 additions & 8 deletions
@@ -39,14 +39,15 @@
     level=logging.INFO
 )

-def document_transfer(data_collection):
+def document_transfer(data_collection, min_length):
     "Transfer the raw document into langchain supported format."
     documents = []
     for data, meta in data_collection:
-        doc_id = str(uuid.uuid4())
-        metadata = {"source": meta, "identify_id":doc_id}
-        doc = Document(page_content=data, metadata=metadata)
-        documents.append(doc)
+        if len(data) > min_length:
+            doc_id = str(uuid.uuid4())
+            metadata = {"source": meta, "identify_id":doc_id}
+            doc = Document(page_content=data, metadata=metadata)
+            documents.append(doc)
     return documents

 def document_append_id(documents):
@@ -84,6 +85,7 @@ def __init__(self,
         self.mode = mode
         self.process = process
         self.retriever = None
+        self.min_chuck_size = min_chuck_size
         self.splitter = RecursiveCharacterTextSplitter(chunk_size= kwargs['child_size'] \
             if 'child_size' in kwargs else 512)
         allowed_retrieval_type: ClassVar[Collection[str]] = (
@@ -162,7 +164,7 @@ def __init__(self,
         data_collection = self.document_parser.load(input=self.input_path, **kwargs)
         logging.info("The parsing for the uploaded files is finished.")

-        langchain_documents = document_transfer(data_collection)
+        langchain_documents = document_transfer(data_collection, self.min_chuck_size)
         logging.info("The format of parsed documents is transferred.")

         if self.vector_database == "Chroma":
@@ -235,7 +237,7 @@ def create(self, input_path, **kwargs):
         Create a new knowledge base based on the uploaded files.
         """
         data_collection = self.document_parser.load(input=input_path, **kwargs)
-        langchain_documents = document_transfer(data_collection)
+        langchain_documents = document_transfer(data_collection, self.min_chuck_size)

         if self.retrieval_type == 'default':
             knowledge_base = self.database.from_documents(documents=langchain_documents, \
@@ -261,7 +263,7 @@ def append_localdb(self, append_path, **kwargs):
         "Append the knowledge instances into a given knowledge base."

         data_collection = self.document_parser.load(input=append_path, **kwargs)
-        langchain_documents = document_transfer(data_collection)
+        langchain_documents = document_transfer(data_collection, self.min_chuck_size)

         if self.retrieval_type == 'default':
             knowledge_base = self.database.from_documents(documents=langchain_documents, \
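The new `min_length` argument is the content length filter from the commit title: fragments at or below the threshold never become `Document` objects. A hedged usage sketch of just this helper, assuming it is importable from the module path of this file:

```python
# Sketch of the content-length filter added to document_transfer in this commit.
from intel_extension_for_transformers.neural_chat.pipeline.plugins.retrieval.retrieval_agent import (
    document_transfer,
)

data_collection = [
    ("Too short", "notes.txt"),  # dropped: len(data) <= min_length
    ("A paragraph comfortably longer than ten characters.", "notes.txt"),  # kept
]
docs = document_transfer(data_collection, 10)  # min_length of 10, as in the updated test
assert len(docs) == 1 and docs[0].metadata["source"] == "notes.txt"
```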

intel_extension_for_transformers/neural_chat/prompts/prompt.py

Lines changed: 4 additions & 4 deletions
@@ -133,7 +133,7 @@
     - Please refer to the search results obtained from the local knowledge base. But be careful to not \
 incorporate the information that you think is not relevant to the question.
     - If you don't know the answer to a question, please don't share false information.\n""" ,
-    roles=("### Question:", "### Search Results:", "### Chat History:", "### Response:"),
+    roles=("### Question: ", "### Search Results: ", "### Chat History: ", "### Response: "),
     sep_style=SeparatorStyle.NO_COLON_SINGLE,
     sep="\n",
 )
@@ -145,7 +145,7 @@
     name="rag_without_context",
     system_message="Have a conversation with a human. " + \
         "You are required to generate suitable response to the user input.\n",
-    roles=("### Input:", "### Response:"),
+    roles=("### Input: ", "### Response: "),
     sep_style=SeparatorStyle.NO_COLON_SINGLE,
     sep="\n",
 )
@@ -157,7 +157,7 @@
     name="rag_without_context_memory",
     system_message="Have a conversation with a human. " + \
         "You are required to generate suitable response to the user input.\n",
-    roles=("### Input:", "### Chat History:", "### Response:"),
+    roles=("### Input: ", "### Chat History: ", "### Response: "),
     sep_style=SeparatorStyle.NO_COLON_SINGLE,
     sep="\n",
 )
@@ -172,7 +172,7 @@
     - Please refer to the search results obtained from the local knowledge base. But be careful to not \
 incorporate the information that you think is not relevant to the question.
     - If you don't know the answer to a question, please don't share false information.\n""",
-    roles=("### Question:", "### Search Results:", "### Chat History:", "### Response:"),
+    roles=("### Question: ", "### Search Results: ", "### Chat History: ", "### Response: "),
     sep_style=SeparatorStyle.NO_COLON_SINGLE,
     sep="\n",
 )
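The prompt change only appends a trailing space to each role tag, but it matters: with `SeparatorStyle.NO_COLON_SINGLE` the role string is concatenated directly with the message text. A sketch of the effect, assuming fastchat-style rendering of `role + message + sep`, which this template style follows:

```python
# Why the trailing space matters under NO_COLON_SINGLE (role + message + sep).
sep = "\n"
for role in ("### Question:", "### Question: "):
    print(repr(role + "What is RAG?" + sep))
# '### Question:What is RAG?\n'   <- old tag runs into the text
# '### Question: What is RAG?\n'  <- new tag renders cleanly
```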

intel_extension_for_transformers/neural_chat/tests/ci/plugins/retrieval/test_parameters.py

Lines changed: 1 addition & 1 deletion
@@ -321,7 +321,7 @@ def test_false_process(self):
         plugins.retrieval.args["input_path"] = "../assets/docs/sample_1.txt"
         plugins.retrieval.args["persist_directory"] = "./false_process"
         plugins.retrieval.args["retrieval_type"] = 'default'
-        plugins.retrieval.args["min_chuck_size"] = 100
+        plugins.retrieval.args["min_chuck_size"] = 10
         plugins.retrieval.args["max_chuck_size"] = 150
         plugins.retrieval.args["process"] = False
         config = PipelineConfig(model_name_or_path="facebook/opt-125m",
