# Builds the three service images and pushes them to DockerHub.
# Runs on every pushed tag matching v*, or manually via workflow_dispatch.
name: Build and Release Images

on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is not referenced by any step below; the image
      # tag always comes from `git describe` in the "Set tag" step.
      tags:
        description: 'Tags'
env:
  AWS_REGION: "us-east-1"

permissions:
  contents: read
  pull-requests: read
  repository-projects: read

jobs:
  release-images:
    runs-on: ubuntu-latest
    steps:
      # Gate the release: only repository admins may run it.
      # Context values are passed through `env` instead of being expanded
      # inside the script text, so they can never be parsed as shell code.
      - name: Check permissions using GitHub CLI
        env:
          GH_TOKEN: ${{ github.token }}
          REPOSITORY: ${{ github.repository }}
          ACTOR: ${{ github.actor }}
        run: |
          permission=$(gh api "repos/${REPOSITORY}/collaborators/${ACTOR}/permission" --jq '.permission')
          if [ "$permission" = "admin" ]; then
            echo "Has admin access"
          else
            echo "Permission denied"
            exit 1
          fi

      # Free disk space on the hosted runner before building the images.
      - name: Remove software and language runtimes we're not using
        run: |
          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/lib/node_modules
          sudo rm -rf /usr/local/julia*
          sudo rm -rf /opt/google/chrome
          df . -h

      - name: Check out repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Refspec is quoted so the shell does not try to glob-expand it.
      - name: Fetch all tags
        run: git fetch origin '+refs/tags/*:refs/tags/*'

      - name: Set up Docker
        uses: ./.github/actions/setup-docker
        with:
          docker-username: ${{ secrets.DOCKER_USERNAME }}
          docker-password: ${{ secrets.DOCKER_PASSWORD }}

      # Export the most recent v* tag as TAG for the following steps.
      - name: Set tag
        run: |
          TAG=$(git describe --tags --match "v*" --abbrev=0)
          echo "TAG=$TAG" >> "$GITHUB_ENV"

      # TAG is read from the environment (set above via GITHUB_ENV) and quoted,
      # rather than interpolated with ${{ env.TAG }}: a tag name is
      # user-controlled text and must not become part of the script source.
      - name: Build and push image - createvectordb
        run: |
          ./dockers/llm.vdb.service/makeDocker.sh elotl/createvectordb "$TAG"

      - name: Build and push image - llm-chat
        run: |
          ./dockers/llm.chatui.service/makeDocker.sh elotl/llm-chat "$TAG"

      - name: Build and push image - serveragllm
        run: |
          ./dockers/llm.rag.service/makeDocker.sh elotl/serveragllm "$TAG"
[Elotl Luna](https://www.elotl.co/luna.html) + +elotl_genai_stack_enduser + +## Retrieval Augmented Generation + +The graphic below shows how RAG is used to determine an answer to the end-user's question about a specific knowledge base. + +
+![elotl_genai_stack_enduser](diagrams/elotl_genai_stack_enduser.png) +
+ +## Installation + +* [Cluster Setup Summary](docs/install.md#cluster-setup-summary) +* [Install Infrastructure Tools](docs/install.md#install-infrastructure-tools) +* [Install Model Serve Stack](docs/install.md#install-model-serve-stack) +* [Model Serving](docs/install.md#model-serve) +* [Retrieval Augmented Generation using FAISS](docs/install.md#retrieval-augmented-generation-rag-using-faiss) +* [Creation of the Vector Store](docs/install.md#creation-of-the-vector-store) +* [Install the RAG & LLM querying service](docs/install.md#setup-rag--llm-service) +* [Send a question to your LLM with RAG](docs/install.md#query-the-llm-with-rag) +* [Query your LLM with RAG using a Chat UI](docs/install.md#query-the-llm-with-rag-using-a-chat-ui) +* [Uninstall](docs/install.md#uninstall) + +Jump to complete install doc available [here](docs/install.md). + diff --git a/demo/llm.chatui.service/auth-proxy.yml b/demo/llm.chatui.service/auth-proxy.yml new file mode 100644 index 0000000..d1a45a3 --- /dev/null +++ b/demo/llm.chatui.service/auth-proxy.yml @@ -0,0 +1,93 @@ +# nginx-auth-proxy-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: nginx-auth-proxy-config +data: + nginx.conf: | + events { + worker_connections 1024; + } + http { + server { + listen 80; + + location / { + auth_basic "Restricted Access"; + auth_basic_user_file /etc/nginx/auth/.htpasswd; + + proxy_pass http://simple-chat-service.default.svc.cluster.local:7860; # Points to our simple chat service + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + } + +--- +# auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: auth-proxy-credentials +type: Opaque +data: + # Generated using: htpasswd -c .htpasswd username + # Then base64 encode the file content + # htpasswd -c .htpasswd your_chosen_username + # cat .htpasswd | base64 + # myuser:elotl + + 
.htpasswd: ZWxvdGw6JGFwcjEkRmtKeUFMWjMkYjd5WXdBdmhHbmtTSjN2QTdCOXlGMAo= + +--- +# auth-proxy-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: auth-proxy +spec: + replicas: 2 # For high availability + selector: + matchLabels: + app: auth-proxy + template: + metadata: + labels: + app: auth-proxy + spec: + volumes: + - name: nginx-config + configMap: + name: nginx-auth-proxy-config + - name: auth-volume + secret: + secretName: auth-proxy-credentials + containers: + - name: nginx + image: nginx:alpine + ports: + - containerPort: 80 + volumeMounts: + - name: nginx-config + mountPath: /etc/nginx/nginx.conf + subPath: nginx.conf + - name: auth-volume + mountPath: /etc/nginx/auth + readOnly: true + +--- +# auth-proxy-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: auth-proxy-service +spec: + type: LoadBalancer + ports: + - port: 80 + targetPort: 80 + protocol: TCP + selector: + app: auth-proxy diff --git a/demo/llm.chatui.service/pv-and-pvc.yaml b/demo/llm.chatui.service/pv-and-pvc.yaml new file mode 100644 index 0000000..02e2ade --- /dev/null +++ b/demo/llm.chatui.service/pv-and-pvc.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: simple-chat-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + hostPath: + path: /mnt/data/simple-chat-logs +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: simple-chat-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi diff --git a/demo/llm.chatui.service/simple-chat.yaml b/demo/llm.chatui.service/simple-chat.yaml new file mode 100644 index 0000000..e48917d --- /dev/null +++ b/demo/llm.chatui.service/simple-chat.yaml @@ -0,0 +1,57 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simple-chat + labels: + app: simple-chat +spec: + replicas: 1 + selector: + matchLabels: + app: simple-chat + template: + metadata: + labels: + app: simple-chat + 
elotl-luna: "true" + annotations: + node.elotl.co/instance-type-regexp: "^(t3.xlarge|n2-standard-4)$" + spec: + containers: + - name: chat + image: elotl/llm-chat:v1.3.12 + imagePullPolicy: Always + ports: + - containerPort: 7860 + env: + - name: RAG_LLM_QUERY_URL + value: "http://serveragllm-service.default.svc.cluster.local:8000" + - name: USE_CHATBOT_HISTORY + value: "True" + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumeMounts: + - name: log-storage + mountPath: /app/logs + volumes: + - name: log-storage + persistentVolumeClaim: + claimName: simple-chat-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: simple-chat-service +spec: + selector: + app: simple-chat + ports: + - protocol: TCP + port: 7860 + targetPort: 7860 + type: ClusterIP diff --git a/demo/llm.gpu.service/block_device_mapping.json b/demo/llm.gpu.service/block_device_mapping.json index 6b60822..16665a1 100644 --- a/demo/llm.gpu.service/block_device_mapping.json +++ b/demo/llm.gpu.service/block_device_mapping.json @@ -1,6 +1,6 @@ [ { - "DeviceName": "/dev/xvda", + "DeviceName": "/dev/xvda", "Ebs": { "DeleteOnTermination": true, "VolumeSize": 80, diff --git a/demo/llm.gpu.service/block_device_mapping_bottlerocket.json b/demo/llm.gpu.service/block_device_mapping_bottlerocket.json new file mode 100644 index 0000000..862cb0a --- /dev/null +++ b/demo/llm.gpu.service/block_device_mapping_bottlerocket.json @@ -0,0 +1,21 @@ +[ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "DeleteOnTermination": true, + "VolumeSize": 80, + "VolumeType": "gp3", + "Encrypted": false + } + }, + { + "DeviceName": "/dev/xvdb", + "Ebs": { + "DeleteOnTermination": true, + "VolumeSize": 80, + "VolumeType": "gp3", + "Encrypted": false, + "SnapshotId": "snap-09946d545033d96f7" + } + } +] diff --git a/demo/llm.gpu.service/get-user-data.sh b/demo/llm.gpu.service/get-user-data.sh new file mode 100755 index 0000000..b28f185 --- /dev/null +++ 
#!/usr/bin/env bash
# Write ./user-data.toml containing the API server endpoint and the cluster
# certificate of an EKS cluster, as reported by `eksctl get cluster`.
#
# Usage: get-user-data.sh <cluster-name> <region>
#
# Requires eksctl and jq on PATH.
set -euo pipefail

# Fail early with a usage message instead of calling eksctl with empty flags.
clustername="${1:?usage: $0 <cluster-name> <region>}"
region="${2:?usage: $0 <cluster-name> <region>}"

# pipefail (above) makes an eksctl failure abort the script instead of being
# masked by jq's exit status.
eksctl get cluster --region "$region" --name "$clustername" -o json \
  | jq --raw-output '.[] | "settings.kubernetes.api-server = \"" + .Endpoint + "\"\nsettings.kubernetes.cluster-certificate =\"" + .CertificateAuthority.Data + "\"\n"' > user-data.toml
SQL_SEARCH_DB_AND_MODEL_PATH + value: ${SQL_SEARCH_DB_AND_MODEL_PATH} + volumeMounts: + - name: log-storage + mountPath: /app/logs + - name: db-storage + mountPath: /app/db + volumes: + - name: log-storage + persistentVolumeClaim: + claimName: rag-llm-pvc + - name: db-storage + persistentVolumeClaim: + claimName: sqldb-s3-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: serveragllm-service + labels: + app: modelragllmserve +spec: + type: ClusterIP + selector: + model: serveragllm + ports: + - name: http + port: 8000 + targetPort: 8000 diff --git a/demo/llm.rag.service/pv-and-pvc.yaml b/demo/llm.rag.service/pv-and-pvc.yaml new file mode 100644 index 0000000..fedfb45 --- /dev/null +++ b/demo/llm.rag.service/pv-and-pvc.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: rag-llm-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + hostPath: + path: /mnt/data/rag-llm-logs +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: rag-llm-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi diff --git a/demo/llm.rag.service/qa-db-s3-pv-pvc.yaml b/demo/llm.rag.service/qa-db-s3-pv-pvc.yaml new file mode 100644 index 0000000..62c840c --- /dev/null +++ b/demo/llm.rag.service/qa-db-s3-pv-pvc.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: sqldb-s3-pv +spec: + capacity: + storage: 5Gi # Ignored, required + accessModes: + - ReadWriteMany # Supported options: ReadWriteMany / ReadOnlyMany + storageClassName: "" # Required for static provisioning + claimRef: # To ensure no other PVCs can claim this PV + namespace: default # Namespace is required even though it's in "default" namespace. 
+ name: sqldb-s3-pvc # Name of PVC + mountOptions: + - allow-delete + - region us-west-2 + - prefix zendesk/ + csi: + driver: s3.csi.aws.com # Required + volumeHandle: s3-csi-driver-volume + volumeAttributes: + bucketName: selvi-txt-to-sql-db # replace with your bucket name +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: sqldb-s3-pvc +spec: + accessModes: + - ReadWriteMany # Supported options: ReadWriteMany / ReadOnlyMany + storageClassName: "" # Required for static provisioning + resources: + requests: + storage: 5Gi # Ignored, required + volumeName: sqldb-s3-pv # Name of your PV diff --git a/demo/llm.rag.service/rag-serveragllmpluslb.yaml b/demo/llm.rag.service/rag-serveragllmpluslb.yaml new file mode 100644 index 0000000..29a932e --- /dev/null +++ b/demo/llm.rag.service/rag-serveragllmpluslb.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: serveragllm-deployment + labels: + app: modelragllmserve +spec: + replicas: 1 + selector: + matchLabels: + model: serveragllm + template: + metadata: + labels: + model: serveragllm + elotl-luna: "true" + annotations: + node.elotl.co/instance-type-regexp: "^(t3.xlarge|n2-standard-4)$" + spec: + containers: + - name: serveragllm + image: elotl/serveragllm:v1.3.12 + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: + requests: + cpu: "1.5" + memory: "1G" + env: + - name: MODEL_LLM_SERVER_URL + value: ${MODEL_LLM_SERVER_URL} + - name: AWS_ACCESS_KEY_ID + value: ${AWS_ACCESS_KEY_ID} + - name: AWS_SECRET_ACCESS_KEY + value: ${AWS_SECRET_ACCESS_KEY} + - name: VECTOR_DB_S3_BUCKET + value: ${VECTOR_DB_S3_BUCKET} + - name: VECTOR_DB_S3_FILE + value: ${VECTOR_DB_S3_FILE} + - name: SYSTEM_PROMPT + value: ${SYSTEM_PROMPT} + - name: MODEL_ID + value: ${MODEL_ID} + - name: MAX_TOKENS + value: ${MAX_TOKENS} + - name: MODEL_TEMPERATURE + value: ${MODEL_TEMPERATURE} + - name: RELEVANT_DOCS + value: ${RELEVANT_DOCS} + - name: IS_JSON_MODE + value: "${IS_JSON_MODE}" + 
volumeMounts: + - name: log-storage + mountPath: /app/logs + volumes: + - name: log-storage + persistentVolumeClaim: + claimName: rag-llm-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: serveragllm-service + labels: + app: modelragllmserve +spec: + type: ClusterIP + selector: + model: serveragllm + ports: + - name: http + port: 8000 + targetPort: 8000 diff --git a/demo/llm.vdb.service/createvdb.yaml b/demo/llm.vdb.service/createvdb.yaml new file mode 100644 index 0000000..a7f0948 --- /dev/null +++ b/demo/llm.vdb.service/createvdb.yaml @@ -0,0 +1,49 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: createvectordb + labels: + app: modeldataingest +spec: + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + elotl-luna: "true" + annotations: + node.elotl.co/instance-type-regexp: "^(t3.xlarge|n2-standard-4)$" + spec: + restartPolicy: Never + containers: + - name: createvectordb + image: elotl/createvectordb:testWeaviateSql1 + imagePullPolicy: Always + resources: + requests: + cpu: "1.5" + memory: "1G" + env: + - name: AWS_ACCESS_KEY_ID + value: ${AWS_ACCESS_KEY_ID} + - name: AWS_SECRET_ACCESS_KEY + value: ${AWS_SECRET_ACCESS_KEY} + - name: VECTOR_DB_INPUT_TYPE + value: ${VECTOR_DB_INPUT_TYPE} + - name: VECTOR_DB_INPUT_ARG + value: ${VECTOR_DB_INPUT_ARG} + - name: VECTOR_DB_S3_BUCKET + value: ${VECTOR_DB_S3_BUCKET} + - name: VECTOR_DB_S3_FILE + value: ${VECTOR_DB_S3_FILE} + - name: EMBEDDING_CHUNK_SIZE + value: ${EMBEDDING_CHUNK_SIZE} + - name: EMBEDDING_CHUNK_OVERLAP + value: ${EMBEDDING_CHUNK_OVERLAP} + - name: EMBEDDING_MODEL_NAME + value: ${EMBEDDING_MODEL_NAME} + - name: WEAVIATE_URI_WITH_PORT + value: ${WEAVIATE_URI_WITH_PORT} + - name: WEAVIATE_GRPC_URI_WITH_PORT + value: ${WEAVIATE_GRPC_URI_WITH_PORT} + - name: WEAVIATE_INDEX_NAME + value: ${WEAVIATE_INDEX_NAME} diff --git a/demo/weaviate/values.yaml b/demo/weaviate/values.yaml new file mode 100644 index 0000000..ebead00 --- /dev/null +++ b/demo/weaviate/values.yaml @@ -0,0 +1,36 @@ 
+# The service controls how weaviate is exposed to the outside world. If you +# don't want a public load balancer, you can also choose 'ClusterIP' to make +# weaviate only accessible within your cluster. +service: + name: weaviate + ports: + - name: http + protocol: TCP + port: 80 + # Target port is going to be the same for every port + type: ClusterIP + annotations: {} + +# The service controls how weaviate gRPC endpoint is exposed to the outside world. +# If you don't want a public load balancer, you can also choose 'ClusterIP' or `NodePort` +# to make weaviate gRPC port be only accessible within your cluster. +# This service is by default enabled but if you don't want it to be deployed in your +# environment then it can be disabled by setting enabled: false option. +grpcService: + enabled: true + name: weaviate-grpc + ports: + - name: grpc + protocol: TCP + port: 50051 + # Target port is going to be the same for every port + type: ClusterIP + annotations: {} + +## The Persistent Volume Claim settings for Weaviate. 
If there's a +## storage.fullnameOverride field set, then the default pvc will not be +## created, instead the one defined in fullnameOverride will be used +#storage: +# fullnameOverride: "weaviate-pvc" +# size: 32Gi +# storageClassName: "" diff --git a/diagrams/elotl_genai_infrastack.png b/diagrams/elotl_genai_infrastack.png new file mode 100644 index 0000000..a419023 Binary files /dev/null and b/diagrams/elotl_genai_infrastack.png differ diff --git a/diagrams/elotl_genai_stack_enduser.png b/diagrams/elotl_genai_stack_enduser.png new file mode 100644 index 0000000..37db3ce Binary files /dev/null and b/diagrams/elotl_genai_stack_enduser.png differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..18b6ae0 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,13 @@ +services: + weaviate: + image: cr.weaviate.io/semitechnologies/weaviate:1.28.2 + ports: + - 8080:8080 + - 50051:50051 + restart: on-failure:0 + environment: + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + ENABLE_API_BASED_MODULES: 'true' + CLUSTER_HOSTNAME: 'node1' \ No newline at end of file diff --git a/dockers/llm.chatui.service/Dockerfile b/dockers/llm.chatui.service/Dockerfile new file mode 100644 index 0000000..a06ba79 --- /dev/null +++ b/dockers/llm.chatui.service/Dockerfile @@ -0,0 +1,29 @@ +# syntax=docker/dockerfile:1 +FROM python:3.11-slim AS base-container + +# Automatically set by buildx +ARG TARGETPLATFORM + +ENV DEBIAN_FRONTEND=noninteractive + +# Install only essential dependencies, clean up after install to reduce image size +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3-pip && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /simple_chat + +# Copy application code and requirements.txt for dependency installation +COPY simple_chat.py . +COPY requirements.txt . 
#!/usr/bin/env bash
# Build the chat UI image for linux/amd64 from the Dockerfile that sits next to
# this script, then push it.
#
# Usage: makeDocker.sh <image-repo> <image-tag>
set -euo pipefail

#set -x

# Both arguments are mandatory; without them the image reference would be ":".
SIMPLE_RAG_LLM_CHAT_REPO="${1:?usage: $0 <image-repo> <image-tag>}"
SIMPLE_RAG_LLM_CHAT_TAG="${2:?usage: $0 <image-repo> <image-tag>}"
IMAGE_REF="${SIMPLE_RAG_LLM_CHAT_REPO}:${SIMPLE_RAG_LLM_CHAT_TAG}"

# Get the directory where the script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )"

echo ""
echo "Building docker for rag chat ui"
docker buildx build --platform=linux/amd64 --load \
  -f "${SCRIPT_DIR}/Dockerfile" \
  -t "${IMAGE_REF}" \
  "${SCRIPT_DIR}"

docker push "${IMAGE_REF}"
# (connect, read) timeout in seconds for calls to the RAG + LLM backend.
# The read timeout is generous because LLM generation can be slow; without any
# timeout a stalled backend would block the UI handler forever.
_REQUEST_TIMEOUT_SECONDS = (10, 300)


def clean_answer(text: str, chatml_end_token: str) -> str:
    """
    Remove all content after and including the specified token from the text.

    Args:
        text (str): The input text to clean
        chatml_end_token (str): The token after which all content should be removed
    Returns:
        str: "" for empty text; the text unchanged when the token is empty;
            otherwise the text truncated at the first occurrence of the token
            (if present) and stripped of surrounding whitespace.
    """
    if not text:  # Handle empty text
        return ""

    if not chatml_end_token:  # Handle empty end tokens
        return text

    # Split text at the token and take only the content before it
    if chatml_end_token in text:
        text = text.split(chatml_end_token, 1)[0]
        logging.info(f"Cleaned text before chatml_end_token: {text}")

    return text.strip()


def generate_source_links(sources):
    """
    Turn source URLs into HTML anchors that open in a new tab.

    Each URL is HTML-escaped before being placed in the attribute and in the
    link text, so a source containing quotes or angle brackets cannot break
    out of the markup.

    Args:
        sources: Iterable of source URLs (any value is converted with str()).
    Returns:
        list[str]: One anchor element per source, in input order.
    """
    # Local import keeps this block self-contained.
    from html import escape

    links = []
    for source in sources:
        url = escape(str(source), quote=True)
        links.append(f'<a href="{url}" target="_blank">{url}</a>')
    return links


def get_api_response(user_message):
    """
    Query the RAG + LLM backend and format its reply as HTML for the chat UI.

    Sends GET {RAG_LLM_QUERY_URL}/answer/<url-quoted question>.

    Args:
        user_message: The question typed by the user.
    Returns:
        str: The cleaned answer followed by the list of source links, or a
            short error message when the backend is unreachable, answers with
            a non-200 status, or returns a body without an "answer" field.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import quote

    try:
        question = quote(f"{user_message}")
        response = requests.get(
            f"{RAG_LLM_QUERY_URL}/answer/{question}",
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            return "API Error: Unable to fetch response."

        result = response.json()
        if not isinstance(result, dict) or "answer" not in result:
            return "Could not fetch response."

        # The backend is expected to nest the payload under "answer"
        # ({"answer": {"answer": ..., "sources": ..., "context": ...}}).
        # NOTE(review): the endpoint is not visible from this file; if a flat
        # body is returned instead, read the fields from the top level rather
        # than failing on a non-dict value.
        payload = result["answer"]
        if not isinstance(payload, dict):
            payload = result

        answer = clean_answer(
            payload.get("answer", "Could not fetch response."), "<|im_end|>"
        )
        sources = payload.get("sources", [])
        links = generate_source_links(sources)
        clickable_links = "<br>".join(links)
        context = payload.get("context", "")
        logging.info(f"Question: {question}\nAnswer: {answer}\nContext: {context}")

        return f"{answer}<br><br>Relevant Tickets:<br>{clickable_links}"
    except requests.RequestException:
        # Covers connection errors and timeouts (and JSON decode errors on
        # requests versions where that error derives from RequestException).
        return "API Error: Failed to connect to the backend service."
    except ValueError:
        # Body was not valid JSON (older requests versions raise plain ValueError).
        return "API Error: Unable to fetch response."


# Chatbot response functions
def chatbot_response_no_hist(_chatbot, user_message):
    """
    Answer a question without keeping previous turns in the chat window.

    Returns the six values wired to the UI outputs: the chat content (only the
    current exchange), an empty string to clear the textbox, updates that show
    the rating slider (reset to 1) and the rating button, and the question and
    answer stored for a later rating submission.
    """
    response_text = get_api_response(user_message)
    return (
        [[user_message, response_text]],
        "",
        gr.update(value=1, visible=True),
        gr.update(visible=True),
        user_message,
        response_text,
    )


def chatbot_response(history, user_message):
    """
    Answer a question and append the exchange to the existing chat history.

    Returns the same six values as chatbot_response_no_hist, with the full
    history as the chat content.
    """
    response_text = get_api_response(user_message)
    # Tolerate a missing history value on the very first message.
    history = [] if history is None else history
    history.append((user_message, response_text))
    return (
        history,
        "",
        gr.update(value=1, visible=True),
        gr.update(visible=True),
        user_message,
        response_text,
    )


def submit_rating(rating, user_message, bot_response):
    """Log the user's rating with the rated question/answer and hide the rating widgets."""
    logging.info(
        f"User rating: {rating}\nQuestion: {user_message}\nAnswer: {bot_response}"
    )
    # Hide the rating slider and submit button after submission
    return gr.update(visible=False), gr.update(visible=False)
msg], + outputs=[ + chatbot, + msg, + rating_slider, + submit_rating_btn, + user_message, + bot_response, + ], + ) + else: + msg.submit( + chatbot_response_no_hist, + inputs=[chatbot, msg], + outputs=[ + chatbot, + msg, + rating_slider, + submit_rating_btn, + user_message, + bot_response, + ], + ) + send_button.click( + chatbot_response_no_hist, + inputs=[chatbot, msg], + outputs=[ + chatbot, + msg, + rating_slider, + submit_rating_btn, + user_message, + bot_response, + ], + ) + + # Handle rating submission with the button + submit_rating_btn.click( + submit_rating, + inputs=[rating_slider, user_message, bot_response], + outputs=[rating_slider, submit_rating_btn], + ) + +app.launch(server_name="0.0.0.0") diff --git a/dockers/llm.rag.service/Dockerfile b/dockers/llm.rag.service/Dockerfile index 35f6fd4..645c945 100644 --- a/dockers/llm.rag.service/Dockerfile +++ b/dockers/llm.rag.service/Dockerfile @@ -1,13 +1,16 @@ -# syntax=docker/dockerfile-upstream:master +# syntax=docker/dockerfile:1 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile -FROM python:3.11-slim as base-container +FROM python:3.11-slim AS base-container # Automatically set by buildx ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ +# Upgrade pip +RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel + +RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ ccache \ @@ -18,14 +21,34 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins WORKDIR /serveragllm +# Install dependencies in separate layers +RUN pip3 install --no-cache-dir \ + "openai" \ + "langchain" \ + "langchain_community" \ + "langchain_huggingface" \ + "unstructured" \ + "sentence-transformers" \ + "faiss-cpu" \ + "fastapi" \ + "boto3" \ + "uvicorn[standard]" \ + "weaviate-client" \ + "langchain_weaviate" \ + "langchain-community" \ + "pandas" \ + 
"sqlalchemy" \ + "langchain-openai" \ + "pandas" + +COPY __init__.py . +COPY proxy_app.py . COPY serveragllm.py . +COPY serverragllm_jira_cvs_local.py . +COPY serverragllm_csv_to_weaviate_local.py . +COPY common.py . COPY pyproject.toml . -RUN --mount=type=cache,target=/root/.cache/pip \ - pip3 install -v --no-cache-dir \ - "openai" "langchain" "sentence-transformers" "faiss-cpu" "uvicorn[standard]" "fastapi" "boto3" && \ - pip3 install --no-cache-dir -e . - EXPOSE 8000 -CMD ["uvicorn", "serveragllm:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["python", "proxy_app.py"] diff --git a/dockers/llm.rag.service/__init__.py b/dockers/llm.rag.service/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dockers/llm.rag.service/common.py b/dockers/llm.rag.service/common.py new file mode 100644 index 0000000..7abc4b2 --- /dev/null +++ b/dockers/llm.rag.service/common.py @@ -0,0 +1,618 @@ +import logging.config +import os +import re +from enum import Enum +from typing import Any, Dict, List + +import joblib +from langchain_community.chat_models import ChatOpenAI +from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool +from langchain_community.utilities import SQLDatabase +from langchain_core.prompts import PromptTemplate +from openai import BadRequestError +from sqlalchemy import create_engine +from transformers import AutoTokenizer +from typing_extensions import Annotated, TypedDict + +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + }, + }, + "handlers": { + "file": { + "level": "DEBUG", + "class": "logging.FileHandler", + "filename": "elotl-qa-in-a-box.log", + "formatter": "default", + }, + "stdout": { + "level": "DEBUG", + "class": "logging.StreamHandler", + "formatter": "default", + }, + }, + "loggers": { + "ElotlQAInABoxLogger": { + "handlers": ["file", "stdout"], + "level": "DEBUG", + "propagate": True, 
+ }, + }, +} +logging.config.dictConfig(LOGGING_CONFIG) +logger = logging.getLogger("ElotlQAInABoxLogger") + + +class State(TypedDict): + question: str + query: str + result: str + + +class SearchType(Enum): + SQL = 1 + VECTOR = 2 + + +MODEL_MAX_CONTEXT_LEN = 8192 +delta = 50 # value by which we keep the prompt len less than the model context len +WEAVIATE_HYBRID_ALPHA_DEFAULT = 0.5 + + +def format_context(results: List[Dict[str, Any]]) -> str: + """Format search results into context for the LLM""" + context_parts = [] + + for result in results: + # TODO: make metadata keys configurable + ticket_metadata = result.metadata + ticket_content = result.page_content + + context_parts.append( + f"Key: {ticket_metadata['ticket']} | Status: {ticket_metadata['status']} - " + f"Type: {ticket_metadata['type']}\n" + f"Content: {ticket_content}...\n" + ) + + return "\n\n".join(context_parts) + + +def trim_answer(generated_answer: str, label_separator: str) -> str: + """ + From the generated_answer, remove all content after and including + the provided label separator + Args: + generated_answer (str): the generated answer to remove from + label_separator (str): string after which content needs to be trimmed + Note: this string will also be trimmed + Returns: + str: Cleaned answer with all content before the label separator + """ + if not generated_answer: # Handle empty text + return "" + answer = generated_answer + # Split text at the token and take only the content before it + if label_separator in generated_answer: + answer = generated_answer.split(label_separator, 1)[0] + logger.info( + f"Label separator: {label_separator} seems to have been included in the generated answer and it has been removed: {answer}" + ) + + return answer.strip() + + +# Answer user's question via vector search or RAG technique +def get_answer_with_settings( + question, retriever, client, model_id, max_tokens, model_temperature, system_prompt +): + docs = retriever.invoke(input=question) + + 
num_of_docs = len(docs) + logger.info( + f"Number of relevant documents retrieved and that will be used as context for query: {num_of_docs}" + ) + logger.info(f"Relevant docs retrieved from Vector store: {docs}") + + context = format_context(docs) + logger.info(f"Length of context after formatting: {len(context)}") + logger.info(f"Context after formatting: {context}") + + logger.info("Calling chat completions for JSON model...") + try: + completions = client.chat.completions.create( + model=model_id, + messages=[ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": f"Context:\n{context}\n\nQuestion: {question}", + }, + ], + max_tokens=max_tokens, + temperature=model_temperature, + stream=False, + ) + except BadRequestError as e: + if ( + e.status_code == 400 + and "Please reduce the length of the messages or completion." + in e.body.get("message", "") + and len(docs) > 1 + ): + docs = docs[:-1] # removing last document + context = format_context(docs) + logger.info( + f"Need to decrease context - length of context after formatting: {len(context)}" + ) + try: + completions = client.chat.completions.create( + model=model_id, + messages=[ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": f"Context:\n{context}\n\nQuestion: {question}", + }, + ], + max_tokens=max_tokens, + temperature=model_temperature, + stream=False, + ) + except Exception as e: + # Handle any error + logger.error(f"An unexpected error occurred: {e}") + errorToUI = { + "answer": f"Please try another question. Received error from LLM invocation: {e}", + "relevant_tickets": [], + "sources": [], + "context": context, + } + return errorToUI + + except Exception as e: + # Handle any error + logger.error(f"An unexpected error occurred: {e}") + errorToUI = { + "answer": f"Please try another question. 
# Answer user's question via text-to-sql technique
def get_sql_answer(
    question,
    model_id,
    max_tokens,
    model_temperature,
    llm_server_url,
    sql_search_db_and_model_path,
):
    """Answer ``question`` by generating SQL, running it, and summarising the result.

    Steps: build a ``ChatOpenAI`` client for the self-hosted model, open the
    SQLite database ``zendesk.db`` under ``sql_search_db_and_model_path``, ask
    the LLM for a SQL query, execute it, and ask the LLM to phrase the result
    in natural language.

    Args:
        question: the user's natural-language question.
        model_id: model name passed to the LLM server.
        max_tokens: completion token limit for the LLM.
        model_temperature: sampling temperature for the LLM.
        llm_server_url: base URL of the OpenAI-compatible LLM server.
        sql_search_db_and_model_path: directory prefix holding ``zendesk.db``;
            it is concatenated directly, so it must end with a path separator.

    Returns:
        A dict with the keys ``answer``, ``relevant_tickets``, ``sources`` and
        ``context``. On any failure the ``answer`` carries the error text and
        the list fields are empty.
    """

    logger.info("Invoking text-to-sql question-answer search")
    try:
        llm = ChatOpenAI(
            model=model_id,
            temperature=model_temperature,
            openai_api_base=llm_server_url,
            openai_api_key="unused-for-self-hosted-llms",
            max_tokens=max_tokens,
        )

        logger.info("Loading the pre-created SQL DB")
        engine = create_engine(
            "sqlite:///" + sql_search_db_and_model_path + "zendesk.db"
        )

        logger.info("Check that the SQL data can be accessed from the DB via querying")
        db = SQLDatabase(engine=engine)
        logger.info(f"DB dialect is: {db.dialect}")
        logger.info(f"Usable table names: {db.get_usable_table_names()}")
        # Routed through the logger (was a bare print) so it reaches the log file too.
        logger.info(f"Table info: {db.get_table_info(['zendesk'])}")

        logger.info("Running sanity test SQL query:")
        db.run("SELECT COUNT(*) FROM zendesk WHERE assignee_name LIKE 'John Doe';")

        # Prompt template to convert NL question to SQL
        # This was manually retrieved from langchain hub and customized
        query_prompt_template = prompt_template_for_text_to_sql()
        logger.info(
            f"Prompt template for text-to-sql conversion: {query_prompt_template}"
        )

        # write_query returns {"query": ...} and execute_query returns
        # {"result": ...}; unwrap both so the State holds plain strings and the
        # follow-up prompt shows the SQL and its result, not dict reprs.
        generated = write_query({"question": question}, query_prompt_template, llm, db)
        executed = execute_query(generated, db)
        state: State = {
            "question": question,
            "query": generated["query"],
            "result": executed["result"],
        }

        # send SQL query and response to LLM and get a natural language answer
        generated_answer = convert_sql_result_to_nl(state, model_id, llm)

    except Exception as e:
        # Service boundary: LLM/DB failures are reported to the UI in the same
        # shape as a normal answer instead of propagating as a server error.
        logger.error(f"An unexpected error occurred: {e}")
        errorToUI = {
            "answer": f"Please try another question. Received error from LLM invocation: {e}",
            "relevant_tickets": [],
            "sources": [],
            "context": "",
        }
        return errorToUI

    answer = postprocess_hallucinations(generated_answer["answer"])

    answerToUI = {
        "answer": answer,
        "relevant_tickets": ["n/a"],
        "sources": ["n/a"],
        "context": "",  # TODO: if this is big consider logger context here and sending some reference id to UI
    }
    return answerToUI
def postprocess_hallucinations(generated_answer: str) -> str:
    """Cut model output at the first marker that signals hallucinated continuation.

    Commonly observed hallucinations are an appended "Context:" / "Content:"
    section, an appended "Question:" or "Instruction:" section, and leaked chat
    template tokens. For each known marker found in the text, everything from
    that marker onward is removed via ``trim_answer``; any remaining
    ``<|im_start|>`` tokens are then deleted.

    Args:
        generated_answer: raw text returned by the LLM; ``None`` is treated
            as an empty answer.

    Returns:
        The cleaned answer.
    """
    logger.info(
        f"Removing any observed hallucinations in the generated answer: {generated_answer}"
    )
    # Each label must be its own list element. A missing comma after
    # "Instruction:" previously fused it with the next literal, so neither
    # label could ever match.
    labels_to_trim = [
        "<|im_end|>",
        "Context:",
        "Question:",
        "Content:",
        "Instruction:",
        "<|end_of_assistant<|im_sep|",
        "<|end-user-query|>",
    ]
    # A completion may carry no text at all; avoid `in None` raising TypeError.
    answer = generated_answer or ""

    for label in labels_to_trim:
        if label in answer:
            answer = trim_answer(answer, label)

    answer = answer.replace("<|im_start|>", "")

    logger.info(f"Answer (after cleanup): {answer}")

    return answer
# Answer user's question in natural language using SQL query and SQL query results from the
# database as context.
def convert_sql_result_to_nl(state: State, model_id, llm):
    """Ask the LLM to phrase a SQL result as a natural-language answer.

    Args:
        state: mapping with the keys ``question``, ``query`` and ``result``.
        model_id: model identifier, used to pick the tokenizer that trims the prompt.
        llm: chat model exposing ``invoke``; the response's ``content`` is the answer.

    Returns:
        ``{"answer": <response content>}``.
    """

    # Parenthesised so the three literals form ONE string. Without the
    # parentheses only the first sentence was assigned and the other two were
    # no-op expression statements that never reached the prompt.
    domainExpertInstructions = (
        "In the provided SQL table, each entry or row refers to a ticket and not a customer."
        " The column titled requester is also referred to as the customer or submitter or client."
        " The column titled all_comments can also be referred to as responses or resolution or details."
    )

    # Sentences are separated explicitly; previously they ran together
    # ("...question.Do not...", "...details.Question:").
    prompt = (
        "You are a customer support ticket expert. Given the following user question, corresponding SQL query, "
        "and SQL result, answer the user's question. "
        "Do not make any references to the SQL query or the SQL result in your answer. "
        + domainExpertInstructions
        + "\n"
        + f'Question: {state["question"]}\n'
        + f'SQL Query: {state["query"]}\n'
        + f'SQL Result: {state["result"]}'
    )
    logger.info(
        f"Prompt for SQL result to NL conversion: {prompt}. Prompt length: {len(prompt)}"
    )

    # Prompt length has to be smaller than model max because of errors like this:
    # This model's maximum context length is 8192 tokens. However, you requested 8220
    # tokens in the messages, Please reduce the length of the messages.",
    # 'type': 'BadRequestError',
    # 'param': None, 'code': 400}
    prompt_token_limit = MODEL_MAX_CONTEXT_LEN - delta
    trimmed_prompt = trim_text_by_tokens(prompt, model_id, prompt_token_limit)
    logger.info(f"Trimmed prompt: {trimmed_prompt}")

    response = llm.invoke(trimmed_prompt)
    logger.info(f"LLM generated NL answer to user question: {response.content}")

    return {"answer": response.content}
def extract_zendesk_ticket_id(query):
    """Return the first ticket number mentioned in ``query``, or ``None``.

    A number is only considered when the standalone word "ticket" (any case)
    appears somewhere in the query. The number itself is the first run of six
    or more digits delimited by word boundaries.
    """
    # TODO: implement smth smarter

    mentions_ticket = re.search(r"\bticket\b", query, re.IGNORECASE) is not None
    if not mentions_ticket:
        return None

    id_match = re.search(r"\b\d{6,}\b", query)
    if id_match is None:
        return None
    return id_match.group(0)
def containsSymbolsOrNumbers(question: str) -> bool:
    """Report whether any word of ``question`` holds a non-letter character.

    The question is split on whitespace. Words ending in "?" (including a
    lone "?") are ignored; every other word is checked for any character
    outside ``a-z`` / ``A-Z``.
    """
    return any(
        re.search(r"[^a-zA-Z]", word)  # anything other than a plain letter
        for word in question.split()
        if not word.endswith("?")
    )
Over time, as we continue to merge feature branches, our history can become a tangled web of commits, making it difficult to track the +""" + + +def test_clean_all_after_first_im_end(): + assert ( + trim_answer(test_string, "<|im_end|>") + == 'A recent SSH issue customers had was related to SSH access being flagged as a problem in their old environment because it was under a reseller account. This issue was identified as a "Won\'t Fix" and the Linux Support Engineer informed the customer that they would need to let their security vendor know about these details so they can be excluded from the report as "False Positives."' + ) diff --git a/dockers/llm.rag.service/makeDocker.sh b/dockers/llm.rag.service/makeDocker.sh index 8965b5d..3b0c7a8 100755 --- a/dockers/llm.rag.service/makeDocker.sh +++ b/dockers/llm.rag.service/makeDocker.sh @@ -6,7 +6,19 @@ set -e SERVE_RAG_LLM_REPO=$1 SERVE_RAG_LLM_TAG=$2 +# Get the directory where the script is located +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )" + +echo "Script directory: ${SCRIPT_DIR}" +echo "Current working directory: $(pwd)" +echo "Directory contents:" +ls -la "${SCRIPT_DIR}" + echo "" echo "Building docker for rag+llm service" -docker build --platform=linux/amd64 --load -f ./Dockerfile -t ${SERVE_RAG_LLM_REPO}:${SERVE_RAG_LLM_TAG} . 
def main():
    """Start uvicorn with the app selected by the Weaviate environment variables.

    When ``WEAVIATE_URI_WITH_PORT``, ``WEAVIATE_GRPC_URI_WITH_PORT`` and
    ``WEAVIATE_INDEX_NAME`` are all set to non-empty values the Weaviate app
    is served; otherwise the default app is served. Host and port come from
    ``APP_HOST`` / ``APP_PORT`` (defaults ``0.0.0.0`` and ``8000``).
    """
    # Check environment variables to determine which app to run
    weaviate_settings = (
        os.getenv("WEAVIATE_URI_WITH_PORT"),
        os.getenv("WEAVIATE_GRPC_URI_WITH_PORT"),
        os.getenv("WEAVIATE_INDEX_NAME"),
    )

    host = os.environ.get("APP_HOST", "0.0.0.0")
    port = int(os.environ.get("APP_PORT", "8000"))

    if all(weaviate_settings):
        # Run the CSV to Weaviate app
        print(f"Starting Weaviate app on {host}:{port}")
        app_path = "serverragllm_csv_to_weaviate_local:app"
    else:
        # Run the default app
        print(f"Starting default app on {host}:{port}")
        app_path = "serveragllm:app"

    uvicorn.run(app_path, host=host, port=port)
a/dockers/llm.rag.service/requirements.txt b/dockers/llm.rag.service/requirements.txt new file mode 100644 index 0000000..55bc420 --- /dev/null +++ b/dockers/llm.rag.service/requirements.txt @@ -0,0 +1,13 @@ +uvicorn +faiss-cpu +fastapi +langchain-community +langchain-huggingface +openai +uvicorn +pandas +sqlalchemy +langchain-openai +langchain_community +pandas +weaviate diff --git a/dockers/llm.rag.service/serveragllm.py b/dockers/llm.rag.service/serveragllm.py index 9e418e0..05e3824 100644 --- a/dockers/llm.rag.service/serveragllm.py +++ b/dockers/llm.rag.service/serveragllm.py @@ -1,19 +1,39 @@ -from typing import Union -from fastapi import FastAPI - +import logging import os +import pickle import sys -import openai -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain_community.vectorstores import FAISS +from enum import Enum +from logging.handlers import TimedRotatingFileHandler +from typing import Any, Dict, List, Union import boto3 -import pickle -import time +from botocore.exceptions import ClientError, NoCredentialsError +from fastapi import FastAPI +from openai import OpenAI + +from common import get_answer_with_settings, get_sql_answer ######## # Setup model name and query template parameters -model = "mosaicml--mpt-7b-chat" +MICROSOFT_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" +MOSAICML_MODEL_ID = "mosaicml/mpt-7b-chat" +RELEVANT_DOCS_DEFAULT = 2 +MAX_TOKENS_DEFAULT = 256 +MODEL_TEMPERATURE_DEFAULT = 0.01 +MODEL_ID_DEFAULT = MOSAICML_MODEL_ID +SQL_SEARCH_DB_AND_MODEL_PATH_DEFAULT = "/app/db/" + + +SYSTEM_PROMPT_DEFAULT = """You are a specialized support ticket assistant. Format your responses following these rules: + 1. Answer the provided question only using the provided context. + 2. Do not add the provided context to the generated answer. + 3. Include relevant technical details when present or provide a summary of the comments in the ticket. + 4. 
def str_to_int(value, name):
    """Convert ``value`` to ``int``, terminating the process when it is not numeric.

    Args:
        value: text to convert (typically an environment variable's value).
        name: label used in the error message to identify the setting.

    Returns:
        The parsed integer. On ``ValueError`` an error is logged and the
        process exits with status 1.
    """
    try:
        return int(value)
    except ValueError:
        logging.error(
            f"Error: Value {name} could not be converted to an integer value, please check."
        )
        sys.exit(1)
########
# Fetch RAG context for question, form prompt from context and question, and call model
def get_answer(question: Union[str, None]):
    """Answer ``question`` with the strategy configured through the environment.

    Settings are re-read on every call; an unset or empty variable falls back
    to the module default: ``MODEL_ID``, ``MODEL_TEMPERATURE``, ``MAX_TOKENS``,
    ``RELEVANT_DOCS``, ``IS_JSON_MODE``, ``SYSTEM_PROMPT``, ``SEARCH_TYPE``
    (``SQL`` or ``VECTOR``) and ``SQL_SEARCH_DB_AND_MODEL_PATH``. Numeric
    values that cannot be parsed terminate the process through
    ``str_to_int`` / ``str_to_float``.

    The module-level ``vectorstore``, ``client``, ``llm_server_url`` and
    ``template`` are used as-is.

    Returns:
        In JSON mode, the dict produced by ``get_sql_answer`` or
        ``get_answer_with_settings``. Otherwise the raw answer string.

    Raises:
        ValueError: if ``SEARCH_TYPE`` is neither ``SQL`` nor ``VECTOR``.
    """

    logging.info(f"In get_answer, received question: {question}")

    model_id = os.environ.get("MODEL_ID") or MODEL_ID_DEFAULT
    logging.info(f"Using Model ID: {model_id}")

    raw_temperature = os.environ.get("MODEL_TEMPERATURE")
    model_temperature = (
        str_to_float(raw_temperature, "MODEL_TEMPERATURE")
        if raw_temperature
        else MODEL_TEMPERATURE_DEFAULT
    )
    logging.info(f"Using Model Temperature: {model_temperature}")

    raw_max_tokens = os.environ.get("MAX_TOKENS")
    max_tokens = (
        str_to_int(raw_max_tokens, "MAX_TOKENS")
        if raw_max_tokens
        else MAX_TOKENS_DEFAULT
    )
    logging.info(f"Using Max Tokens: {max_tokens}")

    raw_relevant_docs = os.environ.get("RELEVANT_DOCS")
    relevant_docs = (
        str_to_int(raw_relevant_docs, "RELEVANT_DOCS")
        if raw_relevant_docs
        else RELEVANT_DOCS_DEFAULT
    )
    logging.info(f"Using top-k search from Vector DB, k: {relevant_docs}")

    is_json_mode = os.environ.get("IS_JSON_MODE", "False") == "True"
    logging.info(f"Using is_json_mode: {is_json_mode}")

    system_prompt = os.environ.get("SYSTEM_PROMPT") or SYSTEM_PROMPT_DEFAULT
    logging.info(f"Using System Prompt: {system_prompt}")

    # TODO: Add question classification block

    search_type_config = os.environ.get("SEARCH_TYPE", "SQL")
    logging.info(f"Using search type config: {search_type_config}")

    # Look the member up by name. An unknown value previously left
    # `search_type` unbound and surfaced as an UnboundLocalError.
    try:
        search_type = SearchType[search_type_config]
    except KeyError:
        raise ValueError(
            f"Unsupported SEARCH_TYPE {search_type_config!r}; expected 'SQL' or 'VECTOR'"
        ) from None
    logging.info(f"Using search type: {search_type}")

    sql_search_db_and_model_path = os.getenv(
        "SQL_SEARCH_DB_AND_MODEL_PATH", SQL_SEARCH_DB_AND_MODEL_PATH_DEFAULT
    )
    logging.info(f"Using sql db and model path: {sql_search_db_and_model_path}")

    if is_json_mode and search_type is SearchType.SQL:
        logging.info("Sending query to the LLM (JSON mode)...")
        logging.info("Handling search type: SQL")
        return get_sql_answer(
            question,
            model_id,
            max_tokens,
            model_temperature,
            llm_server_url,
            sql_search_db_and_model_path,
        )

    # Both remaining paths need a retriever honouring the per-call top-k.
    # It is bound to a fresh local name: assigning to `retriever` or `client`
    # inside this function made them locals and broke every earlier use.
    # Retriever configuration parameters reference:
    # https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.as_retriever
    doc_retriever = vectorstore.as_retriever(search_kwargs={"k": relevant_docs})
    logging.info("Created Vector DB retriever successfully. \n")

    if is_json_mode:
        logging.info("Sending query to the LLM (JSON mode)...")
        logging.info("Handling search type: VECTOR")
        # get_answer_with_settings performs the retrieval itself, so no
        # separate lookup is done here.
        return get_answer_with_settings(
            question,
            doc_retriever,
            client,
            model_id,
            max_tokens,
            model_temperature,
            system_prompt,
        )

    logging.info("Sending query to the LLM (non JSON mode)...")

    logging.info("Retrieving docs relevant to the input question")
    docs = doc_retriever.invoke(input=question)
    logging.info(
        f"Number of relevant documents retrieved and that will be used as context for query: {len(docs)}"
    )

    # concatenate relevant docs retrieved to be used as context
    allcontext = "".join(doc.page_content for doc in docs)
    promptstr = template.format(context=allcontext, question=question)

    completions = client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": promptstr,
            },
        ],
        max_tokens=max_tokens,
        temperature=model_temperature,
        stream=False,
    )

    answer = completions.choices[0].message.content
    logging.info(f"Received answer (from non JSON processing): {answer}")
    return answer
########
# Load vectorstore and get retriever for it

# get env vars needed to access Vector DB
vectordb_bucket = os.environ.get("VECTOR_DB_S3_BUCKET")
if vectordb_bucket is None:
    logging.error("Please set environment variable VECTOR_DB_S3_BUCKET")
    sys.exit(1)
logging.info(f"Using Vector DB S3 bucket: {vectordb_bucket}")

vectordb_key = os.environ.get("VECTOR_DB_S3_FILE")
if vectordb_key is None:
    logging.error("Please set environment variable VECTOR_DB_S3_FILE")
    sys.exit(1)
logging.info(f"Using Vector DB S3 file: {vectordb_key}")

relevant_docs = os.environ.get("RELEVANT_DOCS")
if relevant_docs == "" or relevant_docs is None:
    relevant_docs = RELEVANT_DOCS_DEFAULT
else:
    relevant_docs = str_to_int(relevant_docs, "RELEVANT_DOCS")
logging.info(f"Using top-k search from Vector DB, {relevant_docs}")

# Use s3 client to read in vector store
s3_client = boto3.client("s3")
try:
    response = s3_client.get_object(Bucket=vectordb_bucket, Key=vectordb_key)
except (ClientError, NoCredentialsError) as e:
    # NoCredentialsError is not a ClientError; catching both turns missing
    # AWS credentials into the same logged, clean exit as a bad bucket or key.
    logging.error(
        f"Error accessing object, {vectordb_key} in bucket, {vectordb_bucket}, err: {e}"
    )
    sys.exit(1)

body = response["Body"].read()

logging.info("Loading Vector DB...")
# needs prereq packages: sentence_transformers and faiss-cpu
# SECURITY NOTE(review): pickle.loads executes arbitrary code embedded in the
# payload. This is only safe while the configured bucket/object is writable
# solely by trusted parties - confirm, or move to a non-executable format.
vectorstore = pickle.loads(body)

# Retriever configuration parameters reference:
# https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.as_retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": relevant_docs})
logging.info("Created Vector DB retriever successfully.")
dependencies = [ +# "faiss-cpu", +# "fastapi", +# "langchain-community", +# "langchain-huggingface", +# "openai", +# "uvicorn", +# "weaviate-client", +# "langchain_weaviate", +# ] +# /// + +import os +import sys +from functools import partial +from typing import Union + +import click +import uvicorn +from fastapi import FastAPI +from openai import OpenAI + +from common import get_answer_with_settings_with_weaviate_filter, logger + +SYSTEM_PROMPT_DEFAULT = """You are a specialized support ticket assistant. Format your responses following these rules: + 1. Answer the provided question only using the provided context. + 2. Do not add the provided context to the generated answer. + 3. Include relevant technical details when present or provide a summary of the comments in the ticket. + 4. Include the submitter, assignee and collaborator for a ticket when this info is available. + 5. If the question cannot be answered with the given context, please say so and do not attempt to provide an answer. + 6. Do not create new questions related to the given question, instead answer only the provided question. + 7. Provide a clear, direct and factual answer. 
+ """ + + +def setup( + relevant_docs: int, + llm_server_url: str, + model_id: str, + max_tokens: int, + model_temperature: float, + weaviate_url: str, + weaviate_grpc_url: str, + weaviate_index: str, + embedding_model_name: str, + sql_search_db_and_model_path: str, +): + app = FastAPI() + + # TODO: move to imports + import weaviate + from langchain_huggingface import HuggingFaceEmbeddings + from langchain_weaviate.vectorstores import WeaviateVectorStore + + embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name) + + weaviate_client = weaviate.connect_to_custom( + http_host=weaviate_url.split(":")[0], + http_port=int(weaviate_url.split(":")[1]), + http_secure=False, + grpc_host=weaviate_grpc_url.split(":")[0], + grpc_port=int(weaviate_grpc_url.split(":")[1]), + grpc_secure=False, + ) + + vectorstore = WeaviateVectorStore( + client=weaviate_client, + index_name=weaviate_index, + text_key="text", + embedding=embeddings, + ) + + if not llm_server_url.endswith("/v1"): + llm_server_url = llm_server_url + "/v1" + logger.info( + f"Creating an OpenAI client to the hosted model at URL: {llm_server_url}" + ) + try: + client = OpenAI(base_url=llm_server_url, api_key="na") + except Exception as e: + logger.error(f"Error creating client: {e}") + sys.exit(1) + + get_answer = partial( + get_answer_with_settings_with_weaviate_filter, + vectorstore=vectorstore, + client=client, + model_id=model_id, + max_tokens=max_tokens, + model_temperature=model_temperature, + system_prompt=SYSTEM_PROMPT_DEFAULT, + relevant_docs=relevant_docs, + llm_server_url=llm_server_url, + sql_search_db_and_model_path=sql_search_db_and_model_path, + ) + + @app.get("/answer/{question}") + def read_item(question: Union[str, None] = None): + print(f"Received question: {question}") + answer = get_answer(question) + return {"question": question, "answer": answer} + + return app + + +MICROSOFT_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" +MOSAICML_MODEL_ID = "mosaicml/mpt-7b-chat" 
+RELEVANT_DOCS_DEFAULT = 2 +MAX_TOKENS_DEFAULT = 256 +MODEL_TEMPERATURE_DEFAULT = 0.01 +SQL_SEARCH_DB_AND_MODEL_PATH_DEFAULT = "/app/db/" + +relevant_docs = int(os.getenv("RELEVANT_DOCS", RELEVANT_DOCS_DEFAULT)) + +# llm_server_url = os.getenv("MODEL_LLM_SERVER_URL", "http://localhost:11434/v1") +# model_id = os.getenv("MODEL_ID", "llama2") + +llm_server_url = os.getenv("MODEL_LLM_SERVER_URL", "http://localhost:9000/v1") +# model_id = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct") +model_id = os.getenv("MODEL_ID", "rubra-ai/Phi-3-mini-128k-instruct") + +max_tokens = int(os.getenv("MAX_TOKENS", MAX_TOKENS_DEFAULT)) +model_temperature = float(os.getenv("MODEL_TEMPERATURE", MODEL_TEMPERATURE_DEFAULT)) + +weaviate_url = os.getenv("WEAVIATE_URI_WITH_PORT", "localhost:8080") +weaviate_grpc_url = os.getenv("WEAVIATE_GRPC_URI_WITH_PORT", "localhost:50051") +weaviate_index = os.getenv("WEAVIATE_INDEX_NAME", "my_custom_index") + +embedding_model_name = os.getenv( + "EMBEDDING_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2" +) +# embedding_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1" + +sql_search_db_and_model_path = os.getenv( + "SQL_SEARCH_DB_AND_MODEL_PATH", SQL_SEARCH_DB_AND_MODEL_PATH_DEFAULT +) + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +app = setup( + relevant_docs, + llm_server_url, + model_id, + max_tokens, + model_temperature, + weaviate_url, + weaviate_grpc_url, + weaviate_index, + embedding_model_name, + sql_search_db_and_model_path, +) + + +@click.command() +@click.option( + "--host", + default="127.0.0.1", + help="Host for the FastAPI server (default: 127.0.0.1)", +) +@click.option( + "--port", type=int, default=8000, help="Port for the FastAPI server (default: 8000)" +) +def run(host, port): + # Serve the app using Uvicorn + uvicorn.run( + "serverragllm_csv_to_weaviate_local:app", host=host, port=port, reload=True + ) + + +if __name__ == "__main__": + run() diff --git 
a/dockers/llm.rag.service/serverragllm_jira_cvs_local.py b/dockers/llm.rag.service/serverragllm_jira_cvs_local.py new file mode 100644 index 0000000..506d1eb --- /dev/null +++ b/dockers/llm.rag.service/serverragllm_jira_cvs_local.py @@ -0,0 +1,117 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "faiss-cpu", +# "fastapi", +# "langchain-community", +# "langchain-huggingface", +# "openai", +# "uvicorn", +# ] +# /// + +import os +import pickle +import sys +from functools import partial +from typing import Union + +import click +import uvicorn +from fastapi import FastAPI +from openai import OpenAI + +from common import get_answer_with_settings + + +def setup( + file_path: str, + relevant_docs: int, + llm_server_url: str, + model_id: str, + max_tokens: int, + model_temperature: float, +): + app = FastAPI() + + # Load the object from the pickle file + with open(file_path, "rb") as file: + print("Loading Vector DB...\n") + vectorstore = pickle.load(file) + + # Retriever configuration parameters reference: + # https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.as_retriever + retriever = vectorstore.as_retriever(search_kwargs={"k": relevant_docs}) + print("Created Vector DB retriever successfully. \n") + + print("Creating an OpenAI client to the hosted model at URL: ", llm_server_url) + try: + client = OpenAI(base_url=llm_server_url, api_key="na") + except Exception as e: + print("Error creating client:", e) + sys.exit(1) + + jira_system_prompt = """You are a specialized support ticket assistant. Format your responses following these rules: + 1. Answer the provided question only using the provided context. + 2. Provide a clear, direct and factual answer + 3. Include relevant technical details when present + 4. 
If the information is outdated, mention when it was last updated + """ + + get_answer = partial( + get_answer_with_settings, + retriever=retriever, + client=client, + model_id=model_id, + max_tokens=max_tokens, + model_temperature=model_temperature, + system_prompt=jira_system_prompt, + ) + + @app.get("/answer/{question}") + def read_item(question: Union[str, None] = None): + print(f"Received question: {question}") + answer = get_answer(question) + return {"question": question, "answer": answer} + + return app + + +MICROSOFT_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" +MOSAICML_MODEL_ID = "mosaicml/mpt-7b-chat" +RELEVANT_DOCS_DEFAULT = 2 +MAX_TOKENS_DEFAULT = 64 +MODEL_TEMPERATURE_DEFAULT = 0.01 + + +file_path = os.getenv("FILE_PATH") +if not file_path: + print("Please provide the pickeled vector store path") + +relevant_docs = int(os.getenv("RELEVANT_DOCS", RELEVANT_DOCS_DEFAULT)) +llm_server_url = os.getenv("LLM_SERVER_URL", "http://localhost:11434/v1") +model_id = os.getenv("MODEL_ID", "llama2") +max_tokens = int(os.getenv("MAX_TOKENS", MAX_TOKENS_DEFAULT)) +model_temperature = float(os.getenv("MODEL_TEMPERATURE", MODEL_TEMPERATURE_DEFAULT)) + +app = setup( + file_path, relevant_docs, llm_server_url, model_id, max_tokens, model_temperature +) + + +@click.command() +@click.option( + "--host", + default="127.0.0.1", + help="Host for the FastAPI server (default: 127.0.0.1)", +) +@click.option( + "--port", type=int, default=8000, help="Port for the FastAPI server (default: 8000)" +) +def run(host, port): + # Serve the app using Uvicorn + uvicorn.run("serverragllm_jira_cvs_local:app", host=host, port=port, reload=True) + + +if __name__ == "__main__": + run() diff --git a/dockers/llm.rag.service/serverragllm_zendesk_csv_sql_local.py b/dockers/llm.rag.service/serverragllm_zendesk_csv_sql_local.py new file mode 100644 index 0000000..762de89 --- /dev/null +++ b/dockers/llm.rag.service/serverragllm_zendesk_csv_sql_local.py @@ -0,0 +1,159 @@ +# /// script +# 
requires-python = ">=3.12" +# dependencies = [ +# "faiss-cpu", +# "fastapi", +# "langchain-community", +# "langchain-huggingface", +# "openai", +# "uvicorn", +# ] +# /// + +import logging +import os +import pickle +import sys +from functools import partial +from typing import Union + +import click +import uvicorn +from fastapi import FastAPI +from openai import OpenAI + +from common import SearchType, get_answer_with_settings, get_sql_answer, question_router + + +def setup( + file_path: str, + relevant_docs: int, + llm_server_url: str, + model_id: str, + max_tokens: int, + model_temperature: float, + sql_search_db_and_model_path: str, +): + app = FastAPI() + + # TO DO: Add question classification block + # search_type = question_router(question) + + # For now, hard-coding question type to aggregation + search_type = SearchType.SQL + + match search_type: + case SearchType.SQL: + logging.info("Handling search type: SQL") + + get_answer = partial( + get_sql_answer, + model_id=model_id, + max_tokens=max_tokens, + model_temperature=model_temperature, + llm_server_url=llm_server_url, + sql_search_db_and_model_path=sql_search_db_and_model_path, + ) + case SearchType.VECTOR: + logging.info("Handling search type: VECTOR") + + # Load the object from the pickle file + with open(file_path, "rb") as file: + logging.info("Loading Vector DB...\n") + vectorstore = pickle.load(file) + + # Retriever configuration parameters reference: + # https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.as_retriever + retriever = vectorstore.as_retriever(search_kwargs={"k": relevant_docs}) + logging.info("Created Vector DB retriever successfully. 
\n") + if not llm_server_url.endswith("/v1"): + llm_server_url = llm_server_url + "/v1" + logging.info( + "Creating an OpenAI client to the hosted model at URL: ", llm_server_url + ) + + try: + client = OpenAI(base_url=llm_server_url, api_key="na") + except Exception as e: + logging.error("Error creating client:", e) + sys.exit(1) + + jira_system_prompt = """You are a specialized support ticket assistant. Format your responses following these rules: + 1. Answer the provided question only using the provided context. + 2. Provide a clear, direct and factual answer + 3. Include relevant technical details when present + 4. If the information is outdated, mention when it was last updated + """ + + get_answer = partial( + get_answer_with_settings, + retriever=retriever, + client=client, + model_id=model_id, + max_tokens=max_tokens, + model_temperature=model_temperature, + system_prompt=jira_system_prompt, + llm_server_url=llm_server_url, + ) + + @app.get("/answer/{question}") + def read_item(question: Union[str, None] = None): + logging.info(f"Received question: {question}") + answer = get_answer(question) + return {"question": question, "answer": answer} + + return app + + +MICROSOFT_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" +MOSAICML_MODEL_ID = "mosaicml/mpt-7b-chat" +RELEVANT_DOCS_DEFAULT = 2 +MAX_TOKENS_DEFAULT = 350 +MODEL_TEMPERATURE_DEFAULT = 0.00 + + +file_path = os.getenv("FILE_PATH") +if not file_path: + logging.info("Please provide the DB file path") + +logging.info( + "Setting LLM setup parameters like LLM server URL, model id, model temperature..." 
+) +relevant_docs = int(os.getenv("RELEVANT_DOCS", RELEVANT_DOCS_DEFAULT)) + +# LLM server URL if using ollama hosting locally +# llm_server_url = os.getenv("LLM_SERVER_URL", "http://localhost:11434/v1") +# model_id = os.getenv("MODEL_ID", "llama2") + +# LLM server URL if using k8s elotl hosting + port-forwarding +llm_server_url = os.getenv("MODEL_LLM_SERVER_URL", "http://localhost:9000/v1") +model_id = os.getenv("MODEL_ID", "rubra-ai/Phi-3-mini-128k-instruct") + +max_tokens = int(os.getenv("MAX_TOKENS", MAX_TOKENS_DEFAULT)) +model_temperature = float(os.getenv("MODEL_TEMPERATURE", MODEL_TEMPERATURE_DEFAULT)) + +sql_search_db_and_model_path = os.getenv("SQL_SEARCH_DB_AND_MODEL_PATH", "/app/db/") + +app = setup( + file_path, relevant_docs, llm_server_url, model_id, max_tokens, model_temperature, sql_search_db_and_model_path +) + + +@click.command() +@click.option( + "--host", + default="127.0.0.1", + help="Host for the FastAPI server (default: 127.0.0.1)", +) +@click.option( + "--port", type=int, default=8000, help="Port for the FastAPI server (default: 8000)" +) +def run(host, port): + # Serve the app using Uvicorn + uvicorn.run( + "serverragllm_zendesk_csv_sql_local:app", host=host, port=port, reload=True + ) + + +if __name__ == "__main__": + run() diff --git a/dockers/llm.rag.service/setup.py b/dockers/llm.rag.service/setup.py new file mode 100644 index 0000000..e2674ef --- /dev/null +++ b/dockers/llm.rag.service/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup( + name="serveragllm", + version="0.1.0", + py_modules=["common", "serveragllm", "serverragllm_jira_cvs_local"], +) diff --git a/dockers/llm.vdb.service/.env_local_template b/dockers/llm.vdb.service/.env_local_template new file mode 100644 index 0000000..5f7fefd --- /dev/null +++ b/dockers/llm.vdb.service/.env_local_template @@ -0,0 +1,8 @@ +# Local Settings +LOCAL_DIRECTORY=/path/to/local/storage +OUTPUT_FILENAME=/path/to/local/output_pickled.obj + +# Vector DB Optional Settings +# 
EMBEDDING_CHUNK_SIZE=1000 +# EMBEDDING_CHUNK_OVERLAP=100 +# EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2 \ No newline at end of file diff --git a/dockers/llm.vdb.service/.env_s3_template b/dockers/llm.vdb.service/.env_s3_template new file mode 100644 index 0000000..28e0d4a --- /dev/null +++ b/dockers/llm.vdb.service/.env_s3_template @@ -0,0 +1,12 @@ +# S3 Settings +VECTOR_DB_INPUT_ARG=s3-input-dir +VECTOR_DB_S3_BUCKET=my-s3-bucket +VECTOR_DB_S3_FILE=my-vectordb-file +AWS_REGION=us-east-1 +AWS_ACCESS_KEY_ID=my-access-key +AWS_SECRET_ACCESS_KEY=my-secret-key + +# Vector DB Optional Settings +# EMBEDDING_CHUNK_SIZE=1000 +# EMBEDDING_CHUNK_OVERLAP=100 +# EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2 \ No newline at end of file diff --git a/dockers/llm.vdb.service/Dockerfile b/dockers/llm.vdb.service/Dockerfile index bbcfef0..557ad29 100644 --- a/dockers/llm.vdb.service/Dockerfile +++ b/dockers/llm.vdb.service/Dockerfile @@ -1,6 +1,6 @@ -# syntax=docker/dockerfile-upstream:master +# syntax=docker/dockerfile:1 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile -FROM python:3.11-slim as base-container +FROM python:3.11-slim AS base-container # Automatically set by buildx ARG TARGETPLATFORM @@ -12,18 +12,45 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins ca-certificates \ ccache \ curl \ + libmagic1 \ + file \ libssl-dev ca-certificates make \ git python3-pip && \ rm -rf /var/lib/apt/lists/* WORKDIR /createvectordb +# Upgrade pip +RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel + +# Install dependencies in separate layers +RUN pip3 install --no-cache-dir \ + "langchain" \ + "langchain_community" \ + "langchain_huggingface" + +RUN pip3 install --no-cache-dir \ + "unstructured" \ + "sentence-transformers" \ + "faiss-cpu" + +RUN pip3 install --no-cache-dir \ + "lxml" \ + "bs4" + +RUN pip3 install --no-cache-dir \ + "python-magic" \ + "pydantic_settings" \ + "s3fs" \ + 
"weaviate-client" \ + "langchain_weaviate" + +COPY __init__.py . COPY createvectordb.py . +COPY common.py . +COPY config.py . +COPY s3_utils.py . +COPY service.py . COPY pyproject.toml . -RUN --mount=type=cache,target=/root/.cache/pip \ - pip3 install -v --no-cache-dir \ - "langchain" "sentence-transformers" "faiss-cpu" "boto3" "lxml" "bs4" && \ - pip3 install --no-cache-dir -e . - CMD ["python", "createvectordb.py"] diff --git a/dockers/llm.vdb.service/README.md b/dockers/llm.vdb.service/README.md new file mode 100644 index 0000000..685bc84 --- /dev/null +++ b/dockers/llm.vdb.service/README.md @@ -0,0 +1,39 @@ +# Create Vector Databas + +## Run locally +Create a venv and install requirements.txt + +Locally vector db creation can be run in two modes: local dir or s3. + +Please see template files +```shell +.env_local_template +.env_s3_template +``` + +To run the app You can either export all required env vars or prepare an .env file and run +```shell +uv run createvectordb.py +or +python createvectordb.py +``` + +Or You can pass the file: +```shell +uv run createvectordb.py --env_file backup-.env +``` + +## Run tests +Also install in your venv requirements-dev.tx and call +```shell +uv run pytest +``` + +## Run in k8s +Using pydantic settings introduces one impediment. Before running +```shell +envsubst < createvdb.yaml | kubectl apply -f - +``` +We must make sure all env variables are exported. If for ex EMBEDDING_CHUNK_SIZE is not set in our terminal, +envsubst will put an empty string there and pydantic settings will complain that they can't change +empty setting to integer. 
\ No newline at end of file diff --git a/dockers/llm.vdb.service/__init__.py b/dockers/llm.vdb.service/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dockers/llm.vdb.service/check_files_in_s3_bucket.py b/dockers/llm.vdb.service/check_files_in_s3_bucket.py new file mode 100644 index 0000000..cfb3ec0 --- /dev/null +++ b/dockers/llm.vdb.service/check_files_in_s3_bucket.py @@ -0,0 +1,52 @@ +import boto3 +from botocore.exceptions import ClientError +import argparse +from typing import List, Optional +from createvectordb import list_files_in_s3_folder + +def main(): + """ + Main function to run the S3 file listing script. + Accepts command line arguments for bucket name and folder prefix. + """ + # Set up argument parser + parser = argparse.ArgumentParser(description='List all files in an S3 bucket folder with pagination') + parser.add_argument('--bucket', required=True, help='Name of the S3 bucket') + parser.add_argument('--folder', required=True, help='Folder prefix in the bucket') + parser.add_argument('--profile', help='AWS profile name (optional)', default=None) + parser.add_argument('--region', help='AWS region (optional)', default='us-east-1') + + args = parser.parse_args() + + try: + # Set up AWS session with optional profile + if args.profile: + session = boto3.Session(profile_name=args.profile, region_name=args.region) + else: + session = boto3.Session(region_name=args.region) + + # Create S3 client + s3_client = session.client('s3') + + # Call the listing function + files = list_files_in_s3_folder(args.bucket, args.folder, s3_client) + + # Print summary and sample results + if files: + print(f"Summary of files in {args.bucket}/{args.folder}:") + print(f"Total files found: {len(files)}") + print("\nFirst 5 files:") + for file in files[:5]: + print(file) + + print(f"\nLast 5 files:") + for file in files[-5:]: + print(file) + + print(f"\nFinal count confirmation: {len(files)} files found") + + except Exception as e: + print(f"Error: 
{str(e)}") + +if __name__ == "__main__": + main() diff --git a/dockers/llm.vdb.service/common.py b/dockers/llm.vdb.service/common.py new file mode 100644 index 0000000..8e86776 --- /dev/null +++ b/dockers/llm.vdb.service/common.py @@ -0,0 +1,191 @@ +import json +import os +from typing import List + +import weaviate + +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.vectorstores import FAISS +from langchain_core.documents import Document +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_weaviate.vectorstores import WeaviateVectorStore + + +def load_jsonl_files_from_directory(directory): + data = [] + for filename in os.listdir(directory): + print("Processing file, ", filename, "..." ) + if filename.endswith('.json'): + with open(os.path.join(directory, filename)) as f: + try: + # Try reading as JSONL first + for line in f: + if line.strip(): # Skip empty lines + data.append(json.loads(line.strip())) + except json.JSONDecodeError: + # If that fails, try reading as regular JSON + f.seek(0) # Go back to start of file + data.append(json.load(f)) + return data + + +def get_documents(data): + texts = [doc["text"] for doc in data] + metadatas = [doc["metadata"] for doc in data] + return texts, metadatas + + +def chunk_documents(data, chunk_size, chunk_overlap): + """ + Chunks documents while maintaining alignment between text chunks and metadata + """ + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + + # Lists to store chunks and corresponding metadata + all_chunks = [] + all_metadatas = [] + + for doc in data: + print("Chunking doc with key/ticket ID, ", doc["metadata"].get("ticket") or doc["metadata"].get("key")) + chunks = text_splitter.split_text(doc["text"]) + + doc_metadatas = [doc["metadata"].copy() for _ in chunks] + + # This is just to see if it's used or not + for i, (chunk, metadata) in enumerate(zip(chunks, doc_metadatas)): + 
metadata["chunk_index"] = i + metadata["chunk_total"] = len(chunks) + + all_chunks.extend(chunks) + all_metadatas.extend(doc_metadatas) + + print("Number of chunks created: ", len(all_chunks)) + return all_chunks, all_metadatas + + +def chunk_documents_with_added_metadata(data, chunk_size, chunk_overlap): + """ + Splits documents into smaller text chunks while preserving alignment with metadata. + Additionally, each chunk is enriched by embedding its corresponding metadata into the text. + + The method determines the maximum metadata size across all documents and adjusts + the effective chunk size accordingly to ensure that metadata fits within each chunk. + + Metadata keys with `None` values are excluded from the embedded metadata in the text. + """ + # TODO: find a better way to find the biggest metadata + max_size_of_metadata = 0 + for doc in data: + meta_enhancement = ". ".join([f"{key.title()}: {value}" for key, value in doc["metadata"].items() if value]) + max_size_of_metadata = max(max_size_of_metadata, len(meta_enhancement)) + + print(f"Biggest metada has {max_size_of_metadata} characters.") + effective_chunk_size = chunk_size - max_size_of_metadata + print(f"Effective chunk size will be {effective_chunk_size}") + + # TODO: handle better + if effective_chunk_size <= 0: + raise "Use bigger chunk size" + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + + # Lists to store chunks and corresponding metadata + all_chunks = [] + all_metadatas = [] + + for doc in data: + print("Chunking doc with key/ticket ID, ", doc["metadata"].get("ticket") or doc["metadata"].get("key")) + chunks = text_splitter.split_text(doc["text"]) + chunks_enriched_with_metadata = [] + + doc_metadatas = [doc["metadata"].copy() for _ in chunks] + + meta_enhancement = ". 
".join([f"{key.title()}: {value}" for key, value in doc["metadata"].items() if value]) + + for i, (chunk, metadata) in enumerate(zip(chunks, doc_metadatas)): + chunks_enriched_with_metadata.append(chunk + "\n" + meta_enhancement) + + # This is just to see if it's used or not + metadata["chunk_index"] = i + metadata["chunk_total"] = len(chunks) + + all_chunks.extend(chunks_enriched_with_metadata) + all_metadatas.extend(doc_metadatas) + + print("Number of chunks created: ", len(all_chunks)) + return all_chunks, all_metadatas + + +def create_vectordb_from_data( + data, + embedding_model_name: str, + chunk_size: int, + chunk_overlap: int, +): + # no chunking + # texts, metadatas = get_documents(data) + + # with chunking texts + # texts, metadatas = chunk_documents(data, chunk_size, chunk_overlap) + + # with adding metadata to text + print("Start chunking documents") + texts, metadatas = chunk_documents_with_added_metadata(data, chunk_size, chunk_overlap) + + embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name) + print("Convert to FAISS vectorstore") + vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas) + return vectorstore + + +def create_vectordb_local_weaviate( + data, + embedding_model_name: str, + chunk_size: int, + chunk_overlap: int, + weaviate_url: str, + weaviate_grpc_url: str, + weaviate_index_name: str, +): + # with adding metadata to text + print("Start chunking documents") + texts, metadatas = chunk_documents_with_added_metadata(data, chunk_size, chunk_overlap) + + embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name) + + # adapt data + documents: List[Document] = [] + for txt, met in zip(texts, metadatas): + document = Document( + page_content=txt, + metadata=met + ) + documents.append(document) + + # TODO: move extracting url and port to config + with weaviate.connect_to_custom( + http_host=weaviate_url.split(":")[0], + http_port=int(weaviate_url.split(":")[1]), + http_secure=False, + 
grpc_host=weaviate_grpc_url.split(":")[0], + grpc_port=int(weaviate_grpc_url.split(":")[1]), + grpc_secure=False, + ) as weaviate_client: + # return WeaviateVectorStore.from_documents( + # documents, + # embeddings, + # client=weaviate_client, + # index_name=weaviate_index_name, + # ) + return WeaviateVectorStore.from_texts( + texts, + embeddings, + client=weaviate_client, + metadatas=metadatas, + index_name=weaviate_index_name, + text_key="text", + ) diff --git a/dockers/llm.vdb.service/config.py b/dockers/llm.vdb.service/config.py new file mode 100644 index 0000000..f1df796 --- /dev/null +++ b/dockers/llm.vdb.service/config.py @@ -0,0 +1,161 @@ +import os + +from pydantic_settings import BaseSettings +from pydantic import Field, ValidationError, field_validator + +from typing import Optional + + +def validate_int(value): + if type(value) == int: + return value + try: + return int(value.strip("'").strip("\"")) + except (TypeError, ValueError): + raise ValueError("Value must be convertible to an integer") + + +class S3Settings(BaseSettings): + # This is the directory name on S3 where input files can be found + s3_dir_name: Optional[str] = Field( + alias="VECTOR_DB_INPUT_ARG", + description="Name of the S3 directory" + ) + + # This is the bucket that will be used to store both input datasets for RAG + # as well as the Vector DB created from this dataset + s3_bucket_name: Optional[str] = Field( + alias="VECTOR_DB_S3_BUCKET", + description="Name of the S3 bucket" + ) + + # This is the name of the Vector DB file that will be created by this script + vectordb_name: Optional[str] = Field( + alias="VECTOR_DB_S3_FILE", + description="Name of the created Vector DB" + ) + + # AWS ccredentials + s3_region: Optional[str] = Field( + None, + alias="AWS_REGION", + description="Region of the S3 bucket" + ) + s3_access_key: Optional[str] = Field( + alias="AWS_ACCESS_KEY_ID", + description="Access key for S3" + ) + s3_secret_key: Optional[str] = Field( + 
alias="AWS_SECRET_ACCESS_KEY", + description="Secret key for S3" + ) + + embedding_chunk_size: int = Field( + default=1000, + description="Chunk size used by the embedding model" + ) + + @field_validator('embedding_chunk_size', mode='before') + @classmethod + def validate_chunk_size(cls, v): + return validate_int(v) + + embedding_chunk_overlap: int = Field( + default=100, + description="Overlap size between chunks" + ) + + @field_validator('embedding_chunk_overlap', mode='before') + @classmethod + def validate_chunk_overlap(cls, v): + return validate_int(v) + + embedding_model_name: str = Field( + default="sentence-transformers/all-MiniLM-L6-v2", + description="Name of the embedding model to use" + ) + + class Config: + env_file = ".env" + extra = "ignore" + + +class LocalSettings(BaseSettings): + local_directory: str = Field( + description="Directory path for local storage", + ) + + output_filename: str = Field( + description="Output vectordb filename", + ) + + embedding_chunk_size: int = Field( + default=1000, + description="Chunk size used by the embedding model" + ) + embedding_chunk_overlap: int = Field( + default=100, + description="Overlap size between chunks" + ) + embedding_model_name: str = Field( + default="sentence-transformers/all-MiniLM-L6-v2", + description="Name of the embedding model to use" + ) + + class Config: + env_file = ".env" + extra = "ignore" + + +def try_load_settings(env_file): + if env_file: + try: + s3_settings = S3Settings(_env_file=env_file) + return s3_settings, None + except ValidationError as e: + print("ValidationError: ", e) + try: + local_settings = LocalSettings(_env_file=env_file) + return None, local_settings + except ValidationError as e: + raise ValueError(f"Missing or invalid configuration: {e}") + + try: + s3_settings = S3Settings() + return s3_settings, None + except ValidationError as e: + print("ValidationError: ", e) + try: + local_settings = LocalSettings() + return None, local_settings + except ValidationError as 
e: + raise ValueError(f"Missing or invalid configuration: {e}") + + +class WeaviateSettings(BaseSettings): + weaviate_uri: Optional[str] = Field( + ..., + alias="WEAVIATE_URI_WITH_PORT", + ) + weaviate_grpc_uri: Optional[str] = Field( + ..., + alias="WEAVIATE_GRPC_URI_WITH_PORT", + ) + weaviate_index_name: Optional[str] = Field( + ..., + alias="WEAVIATE_INDEX_NAME", + ) + + class Config: + env_file = ".env" + extra = "ignore" + + def is_set(self) -> bool: + return all([self.weaviate_uri, self.weaviate_grpc_uri, self.weaviate_index_name]) + + +def try_load_weaviate_settings(env_file): + if env_file: + return WeaviateSettings(_env_file=env_file) + else: + return WeaviateSettings() diff --git a/dockers/llm.vdb.service/createvectordb.py b/dockers/llm.vdb.service/createvectordb.py index 3ba5c3a..64ca625 100644 --- a/dockers/llm.vdb.service/createvectordb.py +++ b/dockers/llm.vdb.service/createvectordb.py @@ -1,55 +1,55 @@ -import os +import click +import logging import sys -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain_community.vectorstores import FAISS -from langchain_community.document_loaders.sitemap import SitemapLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter - -import boto3 -import pickle - -vectordb_bucket = "faiss-vectordbs" - -vectordb_key = os.environ.get('VECTOR_DB_S3_FILE') -if vectordb_key is None: - print("Please set environment variable VECTOR_DB_S3_FILE") - sys.exit(1) - -vectordb_input_type = os.environ.get('VECTOR_DB_INPUT_TYPE') -if vectordb_input_type is None: - print("Please set environment variable VECTOR_DB_INPUT_TYPE") - sys.exit(1) - -vectordb_input_arg = os.environ.get('VECTOR_DB_INPUT_ARG') -if vectordb_input_arg is None: - print("Please set environment variable VECTOR_DB_INPUT_ARG") - sys.exit(1) - -# Initialize vectorstore and create pickle representation -os.environ["TOKENIZERS_PARALLELISM"] = "false" -if vectordb_input_type == 'text': - vectorstore = 
@click.command()
@click.option("--env_file", type=click.Path(exists=True), help="Path to the environment file")
def run(env_file: str):
    """Create a vector DB from S3 or local input, as selected by the settings.

    S3 settings take precedence over local-directory settings. When the
    Weaviate settings are complete the data is indexed into Weaviate,
    otherwise a pickled vector store is produced. Exits with status 0 on
    success.
    """
    s3_settings, local_settings = try_load_settings(env_file)
    weaviate_settings = try_load_weaviate_settings(env_file)

    if s3_settings:
        if weaviate_settings.is_set():
            logging.info("---> S3WeaviateDbCreationService")
            service = S3WeaviateDbCreationService(s3_settings, weaviate_settings)
        else:
            logging.info("---> S3VectorDbCreationService")
            service = S3VectorDbCreationService(s3_settings)

    elif local_settings:
        if weaviate_settings.is_set():
            logging.info("---> LocalDirWeaviateDbCreationService")
            service = LocalDirWeaviateDbCreationService(local_settings, weaviate_settings)
        else:
            # Local input without Weaviate must use the local-directory
            # service; the S3 service reads S3-only settings and uploads
            # its result to a bucket.
            logging.info("---> LocalDirDbCreationService")
            service = LocalDirDbCreationService(local_settings)

    else:
        # TODO: not really needed, error will be thrown earlier
        # Raise a real exception: raising a plain string is itself a TypeError.
        raise ValueError("Missing config")

    service.create()

    sys.exit(0)
@pytest.fixture()
def mock_s3_client(s3_base):
    """Yield a botocore S3 client bound to the local moto endpoint.

    Depends on ``s3_base`` so the moto server is running first. Creates the
    ``MOCK_BUCKET_NAME`` bucket, then clears the s3fs instance and listing
    caches so filesystem objects created afterwards do not reuse stale state.
    """
    endpoint = "http://127.0.0.1:5555/"
    bucket_location = {
        'LocationConstraint': "us-east-2",  # TODO: make sure this is the same as local default
    }

    s3_client = Session().create_client("s3", endpoint_url=endpoint)
    s3_client.create_bucket(
        Bucket=MOCK_BUCKET_NAME,
        ACL="public-read",
        CreateBucketConfiguration=bucket_location,
    )

    # Reset cached s3fs instances, then drop any cached directory listings.
    S3FileSystem.clear_instance_cache()
    S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint}).invalidate_cache()

    yield s3_client
+docker buildx build --platform=linux/amd64 --load \ + -f "${SCRIPT_DIR}/Dockerfile" \ + -t ${CREATE_VECTOR_DB_REPO}:${CREATE_VECTOR_DB_TAG} \ + "${SCRIPT_DIR}" + docker push ${CREATE_VECTOR_DB_REPO}:${CREATE_VECTOR_DB_TAG} diff --git a/dockers/llm.vdb.service/pyproject.toml b/dockers/llm.vdb.service/pyproject.toml index 148c485..8969a57 100644 --- a/dockers/llm.vdb.service/pyproject.toml +++ b/dockers/llm.vdb.service/pyproject.toml @@ -3,3 +3,6 @@ name = "createvectordb" # Required version = "1.0.0" # Required description = "Create FAISS vectordb, serialize and upload to S3" requires-python = ">=3.8" + +[tool.setuptools] +py-modules = ["common", "config", "createvectordb", "s3_utils", "service"] \ No newline at end of file diff --git a/dockers/llm.vdb.service/requirements-dev.txt b/dockers/llm.vdb.service/requirements-dev.txt new file mode 100644 index 0000000..9c09313 --- /dev/null +++ b/dockers/llm.vdb.service/requirements-dev.txt @@ -0,0 +1,5 @@ +boto3 +flask +flask_cors +moto +pytest \ No newline at end of file diff --git a/dockers/llm.vdb.service/requirements.txt b/dockers/llm.vdb.service/requirements.txt new file mode 100644 index 0000000..f04e044 --- /dev/null +++ b/dockers/llm.vdb.service/requirements.txt @@ -0,0 +1,8 @@ +click +faiss-cpu +langchain_community +langchain_huggingface +langchain_weaviate +pydantic_settings +s3fs +weaviate-client \ No newline at end of file diff --git a/dockers/llm.vdb.service/s3_utils.py b/dockers/llm.vdb.service/s3_utils.py new file mode 100644 index 0000000..92ce510 --- /dev/null +++ b/dockers/llm.vdb.service/s3_utils.py @@ -0,0 +1,50 @@ +import json +import os + +import s3fs + + +def load_jsonl_files_from_s3(bucket_name, prefix=""): + # TODO: init client with data from config + if os.environ.get("TEST_FAKE_S3") == "true": + fs = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + else: + fs = s3fs.S3FileSystem() + data = [] + + # List all files under the given prefix + files = 
def save_file_to_s3(file_to_upload, bucket, key):
    """Upload a local file or in-memory content to ``{bucket}/{key}``.

    Args:
        file_to_upload: A ``str`` is always treated as the path of a local
            file whose bytes are uploaded. ``bytes``, ``bytearray`` and
            ``memoryview`` values are uploaded unchanged. Any other object is
            uploaded as the result of its ``encode()`` method.
        bucket: Name of the destination bucket.
        key: Object key inside the bucket.

    Raises:
        OSError: If ``file_to_upload`` is a path that cannot be opened.
        AttributeError: If the content is neither bytes-like nor encodable.
    """
    import shutil  # local import: only this function streams a file copy

    # TODO: init client with data from config
    if os.environ.get("TEST_FAKE_S3") == "true":
        fs = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})
    else:
        fs = s3fs.S3FileSystem()
    s3_path = f"{bucket}/{key}"

    if isinstance(file_to_upload, str):
        # Open the local source first so that an invalid path fails before
        # the destination object is opened for writing, and stream the copy
        # instead of loading the whole file into memory.
        with open(file_to_upload, 'rb') as local_file, fs.open(s3_path, 'wb') as s3_file:
            shutil.copyfileobj(local_file, s3_file)
        return

    # Resolve the payload before touching S3 so a bad argument cannot leave a
    # half-written destination behind.
    if isinstance(file_to_upload, (bytes, bytearray, memoryview)):
        payload = file_to_upload
    else:
        payload = file_to_upload.encode()

    with fs.open(s3_path, 'wb') as s3_file:
        s3_file.write(payload)
@dataclass
class LocalDirDbCreationService:
    """Build a vector store from JSON files in a local directory and pickle it to disk.

    Reads its input directory, output file name and embedding parameters from
    ``config``.
    """

    # Local input/output locations plus embedding model, chunk size and overlap.
    config: LocalSettings

    def create(self):
        """Load the input files, build the vector store and write it as a pickle.

        The parent directory of ``config.output_filename`` is created when it
        does not exist yet, so a fresh checkout or an empty output location
        does not make the final write fail.
        """
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        print("Load JSON files")
        data = load_jsonl_files_from_directory(self.config.local_directory)

        print("Convert to FAISS vectorstore")
        vectorstore = create_vectordb_from_data(
            data,
            self.config.embedding_model_name,
            self.config.embedding_chunk_size,
            self.config.embedding_chunk_overlap,
        )

        pickle_byte_obj = pickle.dumps(vectorstore)

        # A bare file name has an empty dirname; only create real directories.
        output_dir = os.path.dirname(self.config.output_filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(self.config.output_filename, "wb") as file:
            file.write(pickle_byte_obj)
        print(f"Pickle byte object saved to {self.config.output_filename}")
self.config.s3_access_key + os.environ["AWS_SECRET_ACCESS_KEY"] = self.config.s3_secret_key + + print("Load JSON files") + data = load_jsonl_files_from_s3(self.config.s3_bucket_name, self.config.s3_dir_name) + + print("Convert to Weaviate vectorstore") + create_vectordb_local_weaviate( + data, + self.config.embedding_model_name, + self.config.embedding_chunk_size, + self.config.embedding_chunk_overlap, + self.db_config.weaviate_uri, + self.db_config.weaviate_grpc_uri, + self.db_config.weaviate_index_name, + ) + + print(f"Weaviate index saved {self.db_config.weaviate_index_name}") diff --git a/dockers/llm.vdb.service/setup.py b/dockers/llm.vdb.service/setup.py new file mode 100644 index 0000000..65465a7 --- /dev/null +++ b/dockers/llm.vdb.service/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup( + name="createvectordb", + version="0.1.0", + py_modules=["common", "config", "createvectordb", "s3_utils", "service"], +) \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/.env_local b/dockers/llm.vdb.service/test_data/.env_local new file mode 100644 index 0000000..e625831 --- /dev/null +++ b/dockers/llm.vdb.service/test_data/.env_local @@ -0,0 +1,8 @@ +# Local Settings +LOCAL_DIRECTORY=test_data/input/ +OUTPUT_FILENAME=test_data/output/output_pickled.obj + +# Vector DB Optional Settings +# EMBEDDING_CHUNK_SIZE=1000 +# EMBEDDING_CHUNK_OVERLAP=100 +# EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2 \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/.env_s3 b/dockers/llm.vdb.service/test_data/.env_s3 new file mode 100644 index 0000000..33b004b --- /dev/null +++ b/dockers/llm.vdb.service/test_data/.env_s3 @@ -0,0 +1,13 @@ +# S3 Settings +VECTOR_DB_INPUT_ARG=s3-input-dir +VECTOR_DB_S3_BUCKET=test_bucket +VECTOR_DB_S3_FILE=test_data/output_pickled.obj +AWS_ACCESS_KEY_ID=test-access-key +AWS_SECRET_ACCESS_KEY=test-secret-key +AWS_REGION=us-east-2 + + +# Vector DB Optional Settings +# 
EMBEDDING_CHUNK_SIZE=1000 +# EMBEDDING_CHUNK_OVERLAP=100 +# EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2 \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/input/file1.json b/dockers/llm.vdb.service/test_data/input/file1.json new file mode 100644 index 0000000..b9a9053 --- /dev/null +++ b/dockers/llm.vdb.service/test_data/input/file1.json @@ -0,0 +1,9 @@ +{ + "text": "title: Split configuration for each cloud provider into a separate subsection\ndescription: \nstatus: To Do\ntype: Improvement\npriority: Medium\ncomponents: APP", + "metadata": { + "ticket": "ENG-915", + "type": "Improvement", + "status": "To Do", + "source": "https://example.atlassian.net/browse/ENG-915" + } +} diff --git a/dockers/llm.vdb.service/test_data/input/file2.json b/dockers/llm.vdb.service/test_data/input/file2.json new file mode 100644 index 0000000..e2ebf71 --- /dev/null +++ b/dockers/llm.vdb.service/test_data/input/file2.json @@ -0,0 +1,9 @@ +{ + "text": "title: Implement caching mechanism for cloud resource queries\ndescription: Develop a robust caching strategy to reduce API calls\nstatus: In Progress\ntype: Feature\npriority: High\ncomponents: Core Infrastructure", + "metadata": { + "ticket": "ENG-916", + "type": "Feature", + "status": "In Progress", + "source": "https://example.atlassian.net/browse/ENG-916" + } +} \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/input/file3.json b/dockers/llm.vdb.service/test_data/input/file3.json new file mode 100644 index 0000000..ceddfca --- /dev/null +++ b/dockers/llm.vdb.service/test_data/input/file3.json @@ -0,0 +1,9 @@ +{ + "text": "title: Add comprehensive error logging for network operations\ndescription: Enhance error tracking and diagnostic capabilities\nstatus: Backlog\ntype: Technical Debt\npriority: Low\ncomponents: Networking", + "metadata": { + "ticket": "ENG-917", + "type": "Technical Debt", + "status": "Backlog", + "source": 
"https://example.atlassian.net/browse/ENG-917" + } +} \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/input/file4.json b/dockers/llm.vdb.service/test_data/input/file4.json new file mode 100644 index 0000000..c6ed1d4 --- /dev/null +++ b/dockers/llm.vdb.service/test_data/input/file4.json @@ -0,0 +1,9 @@ +{ + "text": "title: Optimize database query performance\ndescription: Analyze and improve slow database queries\nstatus: Review\ntype: Performance\npriority: High\ncomponents: Database", + "metadata": { + "ticket": "ENG-918", + "type": "Performance", + "status": "Review", + "source": "https://example.atlassian.net/browse/ENG-918" + } +} \ No newline at end of file diff --git a/dockers/llm.vdb.service/test_data/input/file5.json b/dockers/llm.vdb.service/test_data/input/file5.json new file mode 100644 index 0000000..4bd3ae2 --- /dev/null +++ b/dockers/llm.vdb.service/test_data/input/file5.json @@ -0,0 +1,9 @@ +{ + "text": "title: Implement multi-factor authentication\ndescription: Add additional security layer for user access\nstatus: Planning\ntype: Security\npriority: Critical\ncomponents: Authentication", + "metadata": { + "ticket": "ENG-919", + "type": "Security", + "status": "Planning", + "source": "https://example.atlassian.net/browse/ENG-919" + } +} \ No newline at end of file diff --git a/docs/csv_to_json_steps.md b/docs/csv_to_json_steps.md new file mode 100644 index 0000000..5836973 --- /dev/null +++ b/docs/csv_to_json_steps.md @@ -0,0 +1,40 @@ +# Process CSV to json and pass to RAG app + +## Preparing the data +Go to `GenAI-infra-stack/scripts` create venv and install deps +```shell +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt +``` + +set jira_url in `jira_config.ini` and run + +```shell +uv run process_jira_tickets.py jira_elotl.csv jira_config.ini output_files +``` + +upload these files instead of the wiki docs + +## Vector store creation +Run vector store creation with +```shell +export 
VECTOR_DB_INPUT_ARG="json-format" +``` + +## Rag app +Run rag service with this extra setting +```shell +export IS_JSON_MODE="True" +``` + +## Chat UI app +Run chat UI with the same export MODEL_NAMESPACE=... as rag service: +```shell +envsubst < simple-chat.yaml | kubectl apply -f - +``` + +and port forward to use it: +```shell +kubectl port-forward svc/simple-chat-service 7860:7860 +``` diff --git a/docs/diagrams/elotl_genai_stack_enduser.excalidraw b/docs/diagrams/elotl_genai_stack_enduser.excalidraw new file mode 100644 index 0000000..c5fd3c4 --- /dev/null +++ b/docs/diagrams/elotl_genai_stack_enduser.excalidraw @@ -0,0 +1,1147 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "image", + "version": 1072, + "versionNonce": 784086144, + "index": "Zz", + "isDeleted": false, + "id": "sozWfO1Cl02Q7oTG5RNWh", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 451.8316784237401, + "y": 1667.3962469138542, + "strokeColor": "transparent", + "backgroundColor": "#b2f2bb", + "width": 247.10747663551373, + "height": 172.81372549019588, + "seed": 710464640, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "2Z3sghzybH_XPDwQUq20R", + "type": "arrow" + }, + { + "id": "_AKGA6fa9DRXWg43H0wKz", + "type": "arrow" + } + ], + "updated": 1737489800480, + "link": null, + "locked": false, + "status": "saved", + "fileId": "dc64b0bc089c8f4c09cdb44d37f60a3d2a4c585dd71cd35ef458bbb2bb5bf63645c1db8bb06830860ea84eff4450b570", + "scale": [ + 1, + 1 + ], + "crop": null + }, + { + "id": "rWUWmoYfLLz-IfoYFppk9", + "type": "rectangle", + "x": 497.3854167414968, + "y": 1404.0104431156992, + "width": 149, + "height": 117, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + 
"frameId": null, + "index": "b0A", + "roundness": { + "type": 3 + }, + "seed": 1782416256, + "version": 140, + "versionNonce": 2074882944, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "IySF2gOY4M4SGX2FEX7pf" + }, + { + "id": "lh9O6aJ6Au8J3uoJtj-1u", + "type": "arrow" + }, + { + "id": "Z-RaBCcdD6LWm3f2x-rZh", + "type": "arrow" + }, + { + "id": "2Z3sghzybH_XPDwQUq20R", + "type": "arrow" + }, + { + "id": "_AKGA6fa9DRXWg43H0wKz", + "type": "arrow" + }, + { + "id": "kDIWGhb8HsDs-LQfJ2uX8", + "type": "arrow" + }, + { + "id": "fawA_cGHh8QB7OA4lgEzM", + "type": "arrow" + } + ], + "updated": 1737489613214, + "link": null, + "locked": false + }, + { + "id": "IySF2gOY4M4SGX2FEX7pf", + "type": "text", + "x": 523.9847498529418, + "y": 1427.5104431156992, + "width": 95.80133377710978, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffec99", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0B", + "roundness": null, + "seed": 879772800, + "version": 128, + "versionNonce": 571954304, + "isDeleted": false, + "boundElements": null, + "updated": 1737488271083, + "link": null, + "locked": false, + "text": "RAG\nService", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "rWUWmoYfLLz-IfoYFppk9", + "originalText": "RAG Service", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hgJc4rbHtodgVuvQuEZ0i", + "type": "rectangle", + "x": 878.8854167414968, + "y": 1385.5104431156992, + "width": 163.99999999999991, + "height": 130.99999999999997, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0C", + "roundness": { + "type": 3 + }, + "seed": 2096751744, + "version": 775, + 
"versionNonce": 1127658368, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "uY7qmYvBRIT3lhwMMSwlL" + }, + { + "id": "fawA_cGHh8QB7OA4lgEzM", + "type": "arrow" + }, + { + "id": "Z-RaBCcdD6LWm3f2x-rZh", + "type": "arrow" + } + ], + "updated": 1737489687323, + "link": null, + "locked": false + }, + { + "id": "uY7qmYvBRIT3lhwMMSwlL", + "type": "text", + "x": 898.5490831805531, + "y": 1398.5104431156992, + "width": 124.6726671218872, + "height": 105, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffec99", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0D", + "roundness": null, + "seed": 2101957760, + "version": 778, + "versionNonce": 117400448, + "isDeleted": false, + "boundElements": [], + "updated": 1737489601514, + "link": null, + "locked": false, + "text": "Large\nLanguage\nModel", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hgJc4rbHtodgVuvQuEZ0i", + "originalText": "Large Language Model", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "kFYXswz1jL28PfsTtI6MJ", + "type": "text", + "x": 418.3854167414968, + "y": 1630.0104431156992, + "width": 127.99999999999994, + "height": 34.99999999999997, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0J", + "roundness": null, + "seed": 488105088, + "version": 334, + "versionNonce": 919778176, + "isDeleted": false, + "boundElements": [ + { + "id": "_AKGA6fa9DRXWg43H0wKz", + "type": "arrow" + } + ], + "updated": 1737489133006, + "link": null, + "locked": false, + "text": "Question", + "fontSize": 27.99999999999998, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": 
"Question", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "qsuSWNRWwqSyglkAiT1-U", + "type": "rectangle", + "x": 110.38541674149678, + "y": 1385.0104431156992, + "width": 142, + "height": 134, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0L", + "roundness": { + "type": 3 + }, + "seed": 608039808, + "version": 551, + "versionNonce": 463561600, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "uRYBPOPRWEMN3GkDBswvi" + }, + { + "id": "kDIWGhb8HsDs-LQfJ2uX8", + "type": "arrow" + }, + { + "id": "lh9O6aJ6Au8J3uoJtj-1u", + "type": "arrow" + } + ], + "updated": 1737489639450, + "link": null, + "locked": false + }, + { + "id": "uRYBPOPRWEMN3GkDBswvi", + "type": "text", + "x": 136.4820831093455, + "y": 1417.0104431156992, + "width": 89.80666726430258, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0M", + "roundness": null, + "seed": 2145821824, + "version": 540, + "versionNonce": 1634276480, + "isDeleted": false, + "boundElements": null, + "updated": 1737489597405, + "link": null, + "locked": false, + "text": "Vector\nStore", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "qsuSWNRWwqSyglkAiT1-U", + "originalText": "Vector Store", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "kDIWGhb8HsDs-LQfJ2uX8", + "type": "arrow", + "x": 495.3854167414968, + "y": 1441.0104431156992, + "width": 241, + "height": 0.5497133963660872, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + 
"opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0N", + "roundness": { + "type": 2 + }, + "seed": 240993152, + "version": 1140, + "versionNonce": 61518976, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "33u_O4h2SWkwH0JuLdiMl" + } + ], + "updated": 1737489701587, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -241, + 0.5497133963660872 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": 0.3694310406081393, + "gap": 2, + "fixedPoint": null + }, + "endBinding": { + "elementId": "qsuSWNRWwqSyglkAiT1-U", + "focus": -0.15311908151398868, + "gap": 2, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "33u_O4h2SWkwH0JuLdiMl", + "type": "text", + "x": 378.93941657873637, + "y": 1430.6098580878147, + "width": 19.892000325520833, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0NV", + "roundness": null, + "seed": 103445632, + "version": 9, + "versionNonce": 2105541760, + "isDeleted": false, + "boundElements": null, + "updated": 1737489645739, + "link": null, + "locked": false, + "text": "2", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "kDIWGhb8HsDs-LQfJ2uX8", + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "-Gu6VFXUlObbPjlQhBQ_F", + "type": "text", + "x": 250.90028159503646, + "y": 1306.232436787928, + "width": 294.50360362625395, + "height": 82.50381277938487, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0O", + 
"roundness": null, + "seed": 1849844608, + "version": 1596, + "versionNonce": 1248584576, + "isDeleted": false, + "boundElements": [], + "updated": 1737497077087, + "link": null, + "locked": false, + "text": "Get context for question:\nFind data chunks that are\nmost similar to question", + "fontSize": 22.0010167411693, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Get context for question: Find data chunks that are most similar to question", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "lh9O6aJ6Au8J3uoJtj-1u", + "type": "arrow", + "x": 255.38541674149678, + "y": 1483.743563577265, + "width": 241, + "height": 0.7331204615657043, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Q", + "roundness": { + "type": 2 + }, + "seed": 193094528, + "version": 851, + "versionNonce": 786928768, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "3Gfm1Lvxn8BIWJGQEUDtK" + } + ], + "updated": 1737489703595, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 241, + -0.7331204615657043 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "qsuSWNRWwqSyglkAiT1-U", + "focus": 0.4754557956867315, + "gap": 3, + "fixedPoint": null + }, + "endBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": -0.3451641953790466, + "gap": 1, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "3Gfm1Lvxn8BIWJGQEUDtK", + "type": "text", + "x": 369.87708353532, + "y": 1434.3051396781834, + "width": 17.016666412353516, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + 
"groupIds": [], + "frameId": null, + "index": "b0QV", + "roundness": null, + "seed": 1575556224, + "version": 3, + "versionNonce": 408642688, + "isDeleted": false, + "boundElements": null, + "updated": 1737489170297, + "link": null, + "locked": false, + "text": "3", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "lh9O6aJ6Au8J3uoJtj-1u", + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "fawA_cGHh8QB7OA4lgEzM", + "type": "arrow", + "x": 646.3854167414968, + "y": 1439.0104431156992, + "width": 229.69662303381665, + "height": 3.5694345858785255, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0S", + "roundness": { + "type": 2 + }, + "seed": 434078592, + "version": 1964, + "versionNonce": 409760896, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "GWq7yBytTyP5gAB-cPlf8" + } + ], + "updated": 1737489705880, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 229.69662303381665, + -3.5694345858785255 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": -0.3745079255908868, + "gap": 1, + "fixedPoint": null + }, + "endBinding": { + "elementId": "hgJc4rbHtodgVuvQuEZ0i", + "focus": 0.25290074048226757, + "gap": 2.8033769661833503, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "GWq7yBytTyP5gAB-cPlf8", + "type": "text", + "x": 761.0420616553167, + "y": 1403.22572582276, + "width": 16.383333206176758, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + 
"index": "b0SV", + "roundness": null, + "seed": 1043313792, + "version": 4, + "versionNonce": 301814656, + "isDeleted": false, + "boundElements": null, + "updated": 1737489296382, + "link": null, + "locked": false, + "text": "4", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "fawA_cGHh8QB7OA4lgEzM", + "originalText": "4", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "36kJd_KG7LuD9OdQsrh2Q", + "type": "text", + "x": 301.08541674149683, + "y": 1504.4620560189253, + "width": 184.60000000000002, + "height": 59.54838709677426, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0T", + "roundness": null, + "seed": 429413248, + "version": 1001, + "versionNonce": 1376387968, + "isDeleted": false, + "boundElements": [], + "updated": 1737489183639, + "link": null, + "locked": false, + "text": "Context (e.g.\nTop-2 chunks)", + "fontSize": 23.819354838709707, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Context (e.g. 
Top-2 chunks)", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "luF99D7lkL6zH62Ido8hm", + "type": "text", + "x": 611.1729252335374, + "y": 1302.6221733895152, + "width": 368.20338023121906, + "height": 81.89298135475389, + "angle": 0.0014961378798030722, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0U", + "roundness": null, + "seed": 1190105984, + "version": 2172, + "versionNonce": 161358720, + "isDeleted": false, + "boundElements": [ + { + "id": "Z-RaBCcdD6LWm3f2x-rZh", + "type": "arrow" + } + ], + "updated": 1737497096780, + "link": null, + "locked": false, + "text": "Send question + context + prompt\nasking LLM to answer question\nusing only provided context", + "fontSize": 21.838128361267703, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Send question + context + prompt asking LLM to answer question using only provided context", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "Z-RaBCcdD6LWm3f2x-rZh", + "type": "arrow", + "x": 876.4441901425796, + "y": 1476.0816086961695, + "width": 227.05877340108282, + "height": 3.9288344195297213, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0V", + "roundness": { + "type": 2 + }, + "seed": 469132416, + "version": 1728, + "versionNonce": 275784576, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "RY0tqrJlJpa4rtgGz0Utg" + } + ], + "updated": 1737489697459, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -227.05877340108282, + 3.9288344195297213 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": 
"hgJc4rbHtodgVuvQuEZ0i", + "focus": -0.35281631769609584, + "gap": 2.441226598917183, + "fixedPoint": null + }, + "endBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": 0.31512432076058794, + "gap": 3, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "RY0tqrJlJpa4rtgGz0Utg", + "type": "text", + "x": 757.7984876339809, + "y": 1459.2132524549334, + "width": 17.512000528971353, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0VV", + "roundness": null, + "seed": 553538688, + "version": 4, + "versionNonce": 366191744, + "isDeleted": false, + "boundElements": null, + "updated": 1737489296382, + "link": null, + "locked": false, + "text": "5", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Z-RaBCcdD6LWm3f2x-rZh", + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "135Dh-jayw6rm54gXb9SZ", + "type": "text", + "x": 709.6854167414972, + "y": 1499.2362495673121, + "width": 196.9999999999993, + "height": 31.774193548386997, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0W", + "roundness": null, + "seed": 1074308224, + "version": 1077, + "versionNonce": 1783280512, + "isDeleted": false, + "boundElements": [ + { + "id": "Z-RaBCcdD6LWm3f2x-rZh", + "type": "arrow" + } + ], + "updated": 1737489669116, + "link": null, + "locked": false, + "text": "Answer", + "fontSize": 25.419354838709634, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Answer", + "autoResize": false, + 
"lineHeight": 1.25 + }, + { + "id": "2Z3sghzybH_XPDwQUq20R", + "type": "arrow", + "x": 550.3854167414968, + "y": 1692.0104431156992, + "width": 0, + "height": 169, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0X", + "roundness": { + "type": 2 + }, + "seed": 825877632, + "version": 146, + "versionNonce": 61105280, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "uXO5Eb3XSjTL-lvlcKzL-" + } + ], + "updated": 1737489798719, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + -169 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "sozWfO1Cl02Q7oTG5RNWh", + "focus": -0.20234110550103265, + "gap": 1, + "fixedPoint": null + }, + "endBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": 0.28859060402684567, + "gap": 2, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "uXO5Eb3XSjTL-lvlcKzL-", + "type": "text", + "x": 544.4104168368642, + "y": 1581.0104431156992, + "width": 11.949999809265137, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0XV", + "roundness": null, + "seed": 1022612608, + "version": 3, + "versionNonce": 173941888, + "isDeleted": false, + "boundElements": null, + "updated": 1737489165363, + "link": null, + "locked": false, + "text": "1", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "2Z3sghzybH_XPDwQUq20R", + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "_AKGA6fa9DRXWg43H0wKz", + "type": "arrow", + "x": 592.3854167414968, + "y": 
1530.0104431156992, + "width": 2.112676056338046, + "height": 164, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Y", + "roundness": { + "type": 2 + }, + "seed": 100889728, + "version": 193, + "versionNonce": 679642240, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "_k7OATMqHa85H_RrphtYJ" + } + ], + "updated": 1737489800886, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 2.112676056338046, + 164 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "rWUWmoYfLLz-IfoYFppk9", + "focus": -0.2608573094756878, + "gap": 9, + "fixedPoint": null + }, + "endBinding": { + "elementId": "sozWfO1Cl02Q7oTG5RNWh", + "focus": 0.15948855694187153, + "gap": 1, + "fixedPoint": null + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "_k7OATMqHa85H_RrphtYJ", + "type": "text", + "x": 584.4834217542239, + "y": 1584.5104431156992, + "width": 17.91666603088379, + "height": 35, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0YV", + "roundness": null, + "seed": 5567616, + "version": 3, + "versionNonce": 802374784, + "isDeleted": false, + "boundElements": null, + "updated": 1737489177553, + "link": null, + "locked": false, + "text": "6", + "fontSize": 28, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "_AKGA6fa9DRXWg43H0wKz", + "originalText": "6", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "Y56YdbSZHdRKK9XWeLXO7", + "type": "text", + "x": 609.3854167414968, + "y": 1628.5104431156992, + "width": 127.99999999999994, + "height": 34.99999999999997, + "angle": 0, + 
"strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Z", + "roundness": null, + "seed": 1239827328, + "version": 379, + "versionNonce": 1457402752, + "isDeleted": false, + "boundElements": [], + "updated": 1737489130080, + "link": null, + "locked": false, + "text": "Answer", + "fontSize": 27.99999999999998, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Answer", + "autoResize": false, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": { + "dc64b0bc089c8f4c09cdb44d37f60a3d2a4c585dd71cd35ef458bbb2bb5bf63645c1db8bb06830860ea84eff4450b570": { + "mimeType": "image/png", + "id": "dc64b0bc089c8f4c09cdb44d37f60a3d2a4c585dd71cd35ef458bbb2bb5bf63645c1db8bb06830860ea84eff4450b570", + "dataURL": "", + "created": 1710768417758, + "lastRetrieved": 1737485739181 + } + } +} \ No newline at end of file diff --git a/docs/diagrams/elotl_genai_stack_enduser.png b/docs/diagrams/elotl_genai_stack_enduser.png new file mode 100644 index 0000000..37db3ce Binary files /dev/null and b/docs/diagrams/elotl_genai_stack_enduser.png differ diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000..eddf3ea --- /dev/null +++ b/docs/install.md @@ -0,0 +1,531 @@ +# LLM Chat in a Box POC, v0.1.6, 01/08/25 + +[K8s](https://kubernetes.io/) + [Luna](https://docs.elotl.co/luna/intro/) + [KubeRay](https://docs.ray.io/en/master/cluster/kubernetes/getting-started.html) + [RayService](https://docs.ray.io/en/master/cluster/kubernetes/getting-started/rayservice-quick-start.html) + [vLLM](https://docs.vllm.ai/en/stable/) + Open Source LLM + [RayAutoscaler](https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/configuring-autoscaling.html) + [Retrieval 
Augmented Generation using FAISS](https://python.langchain.com/docs/integrations/vectorstores/faiss/) + +## Overview +This POC is intended to allow you to easily deploy and use a working state-of-the-art chat serving platform for an open-source LLM model via automatically scaling your EKS, GKE, or AKS cloud Kubernetes cluster with economical compute instances. And to easily tear down the deployed serving platform when desired. + +## Cluster Setup Summary + +Run w/Luna on K8s w/L4 (EKS,GKE) & A10 (AKS) w/GPU quota + specified Nvidia GPU drivers + +* Luna-1.2.8, EKS, us-west-2, K8s v1.30.2, w/K8s Nvidia daemonset from +[https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml](https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +) +* Luna-1.2.9, GKE, us-central1, K8s v1.30.5, w/GCP Nvidia daemonset from +[https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml](https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml) +* Luna-1.2.10, AKS, east-us, K8s v1.31.2, w/K8s Nvidia gpu-operator from +```sh +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update +helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator +``` + +## Install Infrastructure Tools + +### Install Luna Cluster Autoscaler on Cloud K8s Cluster + +On existing cloud K8s cluster, install Luna as per cloud K8s in the [Luna docs](https://docs.elotl.co/luna/intro/). +[Download Free trial here](https://www.elotl.co/luna-free-trial.html). + +Please note that running the POC on EKS requires +Luna to be configured to allocate a larger EBS size than the default; this configuration is described in the EKS section below. 
+And the EKS, GKE, and AKS sections below provide instructions to optionally reduce node startup time on those platforms. + +#### EKS + +For EKS, you need to specify a larger EBS size w/Luna aws.blockDeviceMapping option. And you have the choice of configuring Luna +to use conventional node images (default) or Bottlerocket node images. Bottlerocket images can be configured for faster node startup time. + +##### Conventional Node Images + +To specify a larger EBS size for an EKS cluster on which Luna is configured to use conventional node images (default), +download [block_device_mapping.json](https://raw.githubusercontent.com/elotl/GenAI-infra-stack/refs/heads/main/demo/llm.gpu.service/block_device_mapping.json) +and when deploying Luna, include ```--additional-helm-values``` set to: +``` +--set-file aws.blockDeviceMappings=/block_device_mapping.json +``` + +##### Bottlerocket Node Images + +When Luna uses conventional node images, downloading the ray-ml image introduces substantial startup time. +Configuring Luna to instead use Bottlerocket node images with a snapshot volume that prepopulates the nodes +Luna allocates with the ray-ml image avoids the ray-ml image download time. +Run [get-user-data.sh](https://github.com/elotl/GenAI-infra-stack/blob/main/demo/llm.gpu.service/get-user-data.sh) with your cluster name and region to produce user-data.toml. +Download [block_device_mapping_bottlerocket.json](https://github.com/elotl/GenAI-infra-stack/blob/main/demo/llm.gpu.service/block_device_mapping_bottlerocket.json), +which references the snapshot snap-09946d545033d96f7 in us-west-2, built using the instructions +in https://github.com/aws-samples/bottlerocket-images-cache?tab=readme-ov-file#build-ebs-snapshot-with-cached-container-image; +replace with your image name. 
When deploying Luna, include ```--additional-helm-values``` set to: +``` +--set aws.isBottlerocketImage=true +--set aws.imageSsmQueryGeneric=/aws/service/bottlerocket/aws-k8s-%s/x86_64/latest/image_id +--set aws.imageSsmQueryGenericArm=/aws/service/bottlerocket/aws-k8s-%s/arm64/latest/image_id +--set aws.imageSsmQueryGpu=/aws/service/bottlerocket/aws-k8s-%s-nvidia/x86_64/latest/image_id +--set-file aws.blockDeviceMappings=/block_device_mapping_bottlerocket.json +--set-file aws.userData=/user-data.toml +``` +Change the images used by the Ray LLM head and workers in the yaml used in the ray-service installation step below +from rayproject/ray-ml:2.33.0.914af0-py311 to your image (in our case, it is +689494258501.dkr.ecr.us-west-2.amazonaws.com/qa-in-a-box:ray-ml-2.33.0-py311-vllm-0.5.4-hfxfr), +remove the line `pip: ["vllm==0.5.4"]`, and add the following lines below the ray-head and ray-worker image lines to speed up model download: +``` +env: + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: HF_HUB_DISABLE_PROGRESS_BARS + value: "1" +``` +An example of this for the ```microsoft/Phi-3-mini-4k-instruct``` model is [here](https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.fastereks.yaml) + +#### GKE + +For GKE, you can improve the ray-ml image load time by using Image Streaming from the Artifact Registry. To do so, enable the "Image streaming" feature +on your cluster. 
When deploying Luna, set gcp.nodeServiceAccount to the Luna service account that includes the `artifactregistry.reader` role via +inclusion in ```--additional-helm-values``` as: +``` +--set gcp.nodeServiceAccount=-elotl@.iam.gserviceaccount.com +``` +Change the images used by the Ray LLM head and workers in the yaml used in the ray-service installation step below +from rayproject/ray-ml:2.33.0.914af0-py311 to your image (in our case, it is gcr.io/elotl-dev/rayproject/ray-ml:2.33.0.914af0-py311-vllm-0.5.4-hfxfr), +remove the line `pip: ["vllm==0.5.4"]`, and add the following lines below the ray-head and ray-worker image lines +to speed up model download: +``` +env: + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" +``` +An example of this for the ```microsoft/Phi-3-mini-4k-instruct``` model is [here](https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.fastergke.yaml) + +#### AKS + +For AKS, you can improve the ray-ml image load time by using the Artifact Streaming on AKS preview feature on your cluster. To do so, +follow the instructions [here](https://learn.microsoft.com/en-us/azure/aks/artifact-streaming) to register the ArtifactStreamingPreview +feature in your subscription and to enable Artifact Streaming on your ACR image(s). When deploying Luna, set ```azure.enableArtifactStreaming``` to true +in your ```--additional-helm-values``` parameter. 
+ +Change the images used by the Ray LLM head and workers in the yaml used in the ray-service installation step below +from rayproject/ray-ml:2.33.0.914af0-py311 to your image (in our case, it is elotleastus.azurecr.io/rayproject/ray-ml:2.33.0.914af0-py311-vllm-0.5.4-hfxfr), +remove the line `pip: ["vllm==0.5.4"]`, and add the following lines below the ray-head and ray-worker image lines +to speed up model download: +``` +env: + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: HF_HUB_DISABLE_PROGRESS_BARS + value: "1" +``` +An example of this for the ```microsoft/Phi-3-mini-4k-instruct``` model is [here](https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.fasteraks.yaml) + +### Install KubeRay Operator to manage Ray on Cloud K8s Cluster +```sh +helm install kuberay-operator kuberay/kuberay-operator --version 1.1.0-rc.0 +``` +```sh +# Small resource footprint; Installed on static, i.e., non-Luna-allocated resources +``` + +## Install Model Serve Stack +You can choose to install the RayService w/vLLM + Open Source Model Serve Stack either without or with the Ray Autoscaler, as described in the 2 subsections for each of the two models below. If you install it w/o the Ray Autoscaler, the model serve stack will come up more quickly, but will have a fixed number of workers, configured as 1. If you install it with the Ray Autoscaler, the model serve stack will start with 0 workers, will scale to 1 worker as the RayService is activated, and will scale to more workers as needed to handle the query load, configured w/a max of 4. + +The instructions below describe installing a MosaicML model and a Microsoft model. +Note that the Microsoft model is of a more recent vintage and loads faster; hence, +that may be the better choice of the two. 
+ +### [MosaicML Open Source Model](https://huggingface.co/mosaicml/mpt-7b-chat) +Install RayService w/vLLM + MosaicML OS Model w/o Ray Autoscaler + +```sh +kubectl apply -f ray-service.llm.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.yaml +# Large resource footprint; Installed on Luna-allocated resources +# Takes 10-15m: add nodes + large image + vLLM update + Ray setup + model download +# Wait for svc/llm-model-serve-serve-svc to be available [This is the last of the 3 services started] +``` + +Install RayService w/vLLM + MosaicML OS Model w/ Ray Autoscaler + +```sh +kubectl apply -f ray-service.llm.autoscale.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.autoscale.yaml +# Large resource footprint; Installed on Luna-allocated resources +# Takes 10-15m: add nodes + large image + vLLM update + Ray setup + model download +# Wait for svc/llm-model-serve-serve-svc to be available [This is the last of the 3 services started] +``` + +### [Microsoft Open Source Model](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +Install RayService w/vLLM + Microsoft OS Model w/o Ray Autoscaler + +```sh +kubectl apply -f ray-service.llm.Phi-3-mini-4k-instruct.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.yaml +# Large resource footprint; Installed on Luna-allocated resources +# Takes 10-15m: add nodes + large image + vLLM update + Ray setup + model download +# Wait for svc/llm-model-serve-serve-svc to be available [This is the last of the 3 services started] +``` + +Install RayService w/vLLM + Microsoft OS Model w/ Ray Autoscaler + +```sh +kubectl apply -f ray-service.llm.Phi-3-mini-4k-instruct.autoscale.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.autoscale.yaml +# Large resource footprint; Installed on Luna-allocated 
resources +# Takes 10-15m: add nodes + large image + vLLM update + Ray setup + model download +# Wait for svc/llm-model-serve-serve-svc to be available [This is the last of the 3 services started] +``` + +## Model Serve + +### Run Port-forward for Model Endpoint (in the background) +```sh +kubectl port-forward svc/llm-model-serve-serve-svc 8000:8000 +``` + +### Query Model Endpoint + +MosaicML Open Source Model + +```sh +python query.py +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/query.py +# Run in separate terminal window from port-forward command +# Requires “pip install openai” +# Prompts user for query +``` + +Microsoft Open Source Model + +```sh +python query.py +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/query.py +# Run in separate terminal window from port-forward command +# Requires “pip install openai” +# Requires “export MODEL_ID="microsoft/Phi-3-mini-4k-instruct" +# Prompts user for query +``` + +### Experiment with Scaling + +If you installed the model serve stack with the Ray Autoscaler, you can experiment with scaling by presenting sufficient query load. Running the following shell script presented enough load for the Ray Autoscaler to increase the number of workers from 1 to 2, for which the Luna Autoscaler added an additional GPU node to the cluster. + +```sh +./loadtest.sh +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/loadtest.sh +# Run in clone of repo with question.txt available in the same directory +# Use control-C to stop the script +``` + +After the load script was stopped and a period of time elapsed, the Ray Autoscaler reduced the worker count back to 1, and the Luna Autoscaler removed the additional node from the cluster. + +## Retrieval Augmented Generation (RAG) using FAISS + +In this section, we illustrate how LLM chat can be extended to work with custom datasets using the technique of Retrieval Augmented Generation. 
If you do not want to incorporate your custom datasets during the LLM chat you can skip this section. + +In order to use RAG, please follow the instructions in all of these prior sections before you follow the instructions in this section: +* Cluster Setup Summary +* Install Infrastructure Tools +* Install Luna Cluster Autoscaler on Cloud K8s Cluster +* Install KubeRay Operator to manage Ray on Cloud K8s Cluster +* Install Model Serve Stack +* Model Serve + +In this section, we provide an example of storing your RAG dataset and the resulting Vector Store on AWS-specific S3 storage. + +### Setup RAG input dataset + +In order to create the RAG dataset, we will run a Kubernetes job that will retrieve text documents from an S3 bucket, convert each text file into a vector embedding and save these embeddings in a Vector store. For the purpose of this POC, we use FAISS (Facebook’s Similarity Search) library to create both the embeddings and the VectorStore file. Please follow the instructions below setup the RAG dataset as well as the configure the parameters needed to run the vector Store creation Kubernetes job. + + +1. Create an S3 bucket and a folder (prefix) within it. Upload all the text documents that you would like to use as your RAG dataset into this folder. +Use can use the instructions here to create an S3 bucket: [Creating a S3 bucket in AWS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/GetStartedWithS3.html#creating-bucket) and the instructions here to create a folder within this bucket: +[Folder creation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html#create-folder) + +1. Create a local file with these environment variables exported with suitable values: +AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY: These AWS access credentials should have permissions to read from and write to the S3 bucket created in the previous step. 
+If you would like to use your AWS CLI setup locally, you can use these commands to create these environment variables. Please note that these access credentials will not be limited to the minimal S3 bucket read and write permissions that are needed for setting up RAG. It is only provided here for ease of use (and is not meant for a production use). +```sh +export AWS_ACCESS_KEY_ID=$(grep aws_access_key_id ~/.aws/credentials | awk '{print $3}') +export AWS_SECRET_ACCESS_KEY=$(grep aws_secret_access_key ~/.aws/credentials | awk '{print $3}') +``` + +1. VECTOR_DB_INPUT_TYPE: Set this to a value of “text-docs" if the RAG dataset will be a set of documents in text format. Alternatively, you can set it to a value of “sitemap”, if the RAG dataset will need to be built from documents hosted on a website. For e.g: +```sh +export VECTOR_DB_INPUT_TYPE=text-docs +``` + +1. VECTOR_DB_INPUT_ARG: +If the VECTOR_DB_INPUT_TYPE value is “text-docs”, then this env variable will be set to the value of the folder or prefix name within an S3 bucket where the text documents will be uploaded by the customer. +If the VECTOR_DB_INPUT_TYPE value is “sitemap”, then this env variable will be set to the URL value of sitemap of a website whose pages will be used as the RAG dataset. + +1. VECTOR_DB_S3_BUCKET: Name of the S3 bucket that will contain the input dataset to be used for the RAG as well as RAG vector datastore. Please note that when VECTOR_DB_INPUT_TYPE value is “sitemap”, there is no input dataset that is needed to be uploaded to the S3 bucket. This is because the sitemap URL will be parsed by the Kto retrieve the dataset. + +1. VECTOR_DB_S3_FILE: Name of the vector DB file that will be created by Elotl and saved in the provided S3 bucket. + +1. MODEL_ID = [ microsoft/Phi-3-mini-4k-instruct | mosaicml/mpt-7b-chat ] Select the LLM model that is to be used. 
You can read about these two models here: +[https://huggingface.co/microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +[https://huggingface.co/mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat) + +1. MODEL_LLM_SERVER_URL: Set this env var to the value http://llm-model-serve-serve-svc..svc.cluster.local:8000 +Please replace with “default” if you plan to follow the remaining instructions in this RAG section without any changes. In case you will be deploying the RAG LLM service in a custom namespace, then please replace with the name of the custom namespace. + +Source all these environment variables to your local shell. + +Note: If you chose to work with “text documents” for your RAG dataset, your documents would be made available at this S3 URI: +s3://// + +We will now setup some environment variables that are needed to enable us to customize how the Vector Store creation and RAG querying is performed. + +```sh +# LLM querying configurable parameters: + MODEL_ID (DEFAULT=mosaicml/mpt-7b-chat) + RELEVANT_DOCS (DEFAULT = 2) + MAX_TOKENS (DEFAULT=128) + MODEL_TEMPERATURE (DEFAULT=0.01) +``` + +```sh +# Vector Store creation' configurable parameters: + EMBEDDING_CHUNK_SIZE (DEFAULT=1000) + EMBEDDING_CHUNK_OVERLAP (DEFAULT=100) + EMBEDDING_MODEL_NAME (DEFAULT=sentence-transformers/all-MiniLM-L6-v2) +``` + +If You decide to pass to vector db creation a file created using our process_jira_tickets.py script set the following to "True". +```sh + IS_JSON_MODE (DEFAULT="False") +``` + +### Sample RAG Dataset +As an example of a RAG dataset, you could use this subset of Wikipedia docs: [https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia](https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia) +This dataset is accompanied with a number of Questions and Answers that can be used to validate RAG functionality. 
+ +You can use this command to download this dataset: +```sh +git clone https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia +``` + + +## Creation of the Vector store +The vector Store can be created in your S3 bucket by running this Kubernetes job on your cluster. For the purpose of this POC, the default namespace is being used. Alternate namespaces can be used if desired. You can download createvdb.yaml from here: +[https://github.com/elotl/GenAI-infra-stack/blob/main/demo/llm.vdb.service/createvdb.yaml](https://github.com/elotl/GenAI-infra-stack/blob/main/demo/llm.vdb.service/createvdb.yaml) + +```sh +envsubst < createvdb.yaml | kubectl apply -f - +``` + +Ensure that the k8s job and corresponding pod are running as expected: + +```sh +% kubectl get jobs +NAME STATUS COMPLETIONS DURATION AGE +createvectordb Running 0/1 5s 5s +``` + +```sh +% kubectl get pods +NAME READY STATUS RESTARTS AGE +createvectordb-kzrw6 1/1 Running 0 118s +``` + +This will take a few minutes to complete. The logs in the above pod will end with these messages. + +```sh +...SNIP... +Downloaded file, mini-rag-wikipedia-input/S10_set6_a9.txt.clean successfully to directory, /tmp/selvi-s3-rag-wikipedia +Downloaded file, mini-rag-wikipedia-input/S10_set6_topics.txt successfully to directory, /tmp/selvi-s3-rag-wikipedia +Number of files downloaded is 165, local tmp dir is /tmp/selvi-s3-rag-wikipedia +Number of documents loaded via DirectoryLoader is 165 +Uploaded vectordb to selvi-faiss-vectordbs selvi-s3-rag-wikipedia +``` + +After the job completes, please ensure that the Vector Store file has been created in your S3 bucket. 
Here is a screenshot of the Vector Store file for the mini RAG dataset: + +You can use this AWS cli command to verify that it was created correctly: +```sh +% aws s3 ls $VECTOR_DB_S3_BUCKET/$VECTOR_DB_S3_FILE +``` +```sh +2024-10-30 12:52:52 104804503 selvi-s3-rag-wikipedia +``` + +## Setup RAG + LLM service + +We will now create a Kubernetes Deployment and a Service that will take in the user’s question, interact with the Vector Store to find relevant documents and then query our hosted LLM service to provide an answer. You can download the manifest rag-chat-serveragllm.yaml from here: [rag-chat-serveragllmpluslb.yaml](https://github.com/elotl/GenAI-infra-stack/blob/main/demo/llm.rag.service/rag-chat-serveragllmpluslb.yaml) + +```sh +envsubst < rag-chat-serveragllmpluslb.yaml | kubectl apply -f - +``` + +Please wait for the deployment and Kubernetes LoadBalancer service to become ready and to also obtain an external IP. This can take a few minutes. The command outputs below specifically show the deployment, pod and services associated with the RAQ LLM service. + +```sh +# View deployments + +% kubectl get deploy +NAME READY UP-TO-DATE AVAILABLE AGE +serveragllm-deployment 1/1 1 1 2m12s +... + +# View pods +% kubectl get pods +NAME READY STATUS RESTARTS AGE +serveragllm-deployment-7bcd47c9dc-nqs2s 1/1 Running 0 2m15s + +# View services +% kubectl get svc +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +serveragllm-service LoadBalancer 10.100.211.63 80:32581/TCP 2m19s +``` + +Please note the IP listed in the EXTERNAL-IP column shown in the output of the `kubectl get svc` command above. + + +## Query the LLM with RAG +You can use the provided scripts/query/rag_query.py script with port forwarding to query the RAG+LLM service endpoint and ask questions about your RAG dataset. 
+ +### Run Port-forward for Model Endpoint (in the background) +```sh +nohup kubectl port-forward svc/serveragllm-service 8000:8000 & +``` + +Use the script to query the RAG+LLM service + +```sh +python rag_query.py +``` +``` +Type your query here: What are the two types of elephants in Africa? +Answer: The two types of elephants in Africa are the African bush elephant (Loxodonta africana) and the African forest elephant (Loxodonta cyclotis). +``` + +## Query the LLM with RAG using a Chat UI + +### Generate and setup a password for your Chat UI + +Run the tool `htpasswd` locally to generate credentials needed for authentication + +```sh +htpasswd -c .htpasswd +``` + +Convert the encrypted password to base64 encoding to be used in a K8s secret: + +```sh +cat .htpasswd | base64 +``` + +Paste the output of the prior command inside the Secret in this manifest: `demo/llm.chatui.service/auth-proxy.yml` + +```sh +--- +# auth-proxy.yaml +apiVersion: v1 +kind: Secret +metadata: + name: auth-proxy-credentials +type: Opaque +data: + # Generated using: htpasswd -c .htpasswd username + # Then base64 encode the file content + # htpasswd -c .htpasswd your_chosen_username + # cat .htpasswd | base64 + # myuser:elotl + + .htpasswd: +--- +``` + + +### Install the authentication proxy for the Chat UI + +```sh +kubectl apply -f demo/llm.chatui.service/auth-proxy.yml +``` + +### Install Chat UI app + +```sh +kubectl apply -f demo/llm.chatui.service/simple-chat.yaml +``` + +Wait for an external IP to be associated with the `auth-proxy-service`. You can now access this external IP type from a browser. You will be asked to enter a username and password before viewing your Chat UI. + +```sh +% kubectl get svc auth-proxy-service +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +auth-proxy-service LoadBalancer 10.100.230.224 80:32497/TCP 85m +``` + +# Uninstall + +## 1. Uninstall Model Serve Stack + +### A. 
[MosaicML Open Source Model](https://huggingface.co/mosaicml/mpt-7b-chat) + +Uninstall RayService w/vLLM + MosaicML OS Model w/o Ray Autoscaler + +```sh +kubectl delete -f ray-service.llm.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.yaml +# After around 5m, Luna will scale down the nodes that were allocated for the RayService +``` + +Uninstall RayService w/vLLM + MosaicML OS Model w/ Ray Autoscaler + +```sh +kubectl delete -f ray-service.llm.autoscale.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.autoscale.yaml +# After around 5m, Luna will scale down the nodes that were allocated for the RayService +``` + +### B. [Microsoft Open Source Model](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) + +Uninstall RayService w/vLLM + Microsoft OS Model w/o Ray Autoscaler + +```sh +kubectl delete -f ray-service.llm.Phi-3-mini-4k-instruct.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.yaml +# After around 5m, Luna will scale down the nodes that were allocated for the RayService +``` + +## 2. Uninstall RayService w/vLLM + Microsoft OS Model w/ Ray Autoscaler +```sh +kubectl delete -f ray-service.llm.Phi-3-mini-4k-instruct.autoscale.yaml +``` +```sh +# From https://github.com/elotl/skyray/blob/main/luna-llm-serve/ray-service.llm.Phi-3-mini-4k-instruct.autoscale.yaml +# After around 5m, Luna will scale down the nodes that were allocated for the RayService +``` + +## 3. Uninstall Infrastructure Tools + +### Uninstall KubeRay +```sh +helm uninstall kuberay-operator +``` + +### Uninstall Luna + +Uninstall Luna as per Installation/Cleanup for cloud K8s type in the [Luna Docs](https://docs.elotl.co/luna/intro/). 
+ +# Potential Development Areas +* Replace one shot question/answer with interactive chat, also provide browser interface +* Provide straightforward mechanism to update models diff --git a/docs/rag-user-docs.md b/docs/rag-user-docs.md new file mode 100644 index 0000000..d0e0337 --- /dev/null +++ b/docs/rag-user-docs.md @@ -0,0 +1,58 @@ +# End-User Overview of Elotl's Question-Answer ChatBot + +Elotl's Question-Answer ChatBot is powered by these technologies: + +1. [Retrieval Augmented Generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) +2. Open-Source [Large Language Models](https://en.wikipedia.org/wiki/Large_language_model) +3. [Vector Stores](https://en.wikipedia.org/wiki/Vector_database) + +This is a high-level graphic on how these technologies are used to determine an answer to the end-user's question about a specific knowledge base. + +
+elotl_genai_stack_enduser +
+ +## What types of questions can a RAG-based Chatbot answer well? + +- Question-Answering Chatbots powered by RAG & LLMs are good at answering questions about a small subset of relevant data from the given +Knowledge Base. This is because of how Retrieval Augmented Generation works: User questions are first matched to "chunks" of data from +the user's Knowledge Base, which has been stored in a VectorStore or Vector Database. The user's question, along with these retrieved matching +chunks, is sent to the LLM with an appropriate system prompt. This prompt instructs the LLM to answer the question using the context retrieved +from the specialized knowledge base. + +- Let's take, for example, a Question-Answer Chatbot working with an engineering team's JIRA ticket Knowledge Base. Here are some sample questions +that can be answered well: + +```sh +What was the issue with the Apache Airflow installation? +What type of security issues have been handled? +Were the SQL issues resolved for the ENG team? +Were any Kubernetes upgrades initiated on the cloud? +Can you give the description of any upgrade requests that have been received? +``` + +## What types of questions is a RAG-based ChatBot not designed to answer well? + +The RAG technique is not intended to answer knowledge aggregation questions about a large amount of data from a Knowledge Base. + +```sh +What is the most frequent Kubernetes issue? +Can you summarize all the recent upgrade tasks? +Can you find the ticket that's been unresolved for the longest time? +``` + +Let's look into one of these questions in detail to understand why the RAG technique and input token limits of LLMs make it difficult for a +RAG-based Question-Answering ChatBot to answer well. + +```sh +What is the most frequent Kubernetes issue? +``` + +A question like this requires access to all Kubernetes-related tickets in the JIRA knowledge base to be able to accurately determine which one +is the most frequent. 
Depending on the specific dataset used to create the Vector store/DB, the number of Kubernetes-related tickets could range from a +couple to many hundreds. In the former case, i.e. if there are only 2 Kubernetes-related tickets, then a top-K similarity search during the Vector +store retrieval step would be able to pass all the necessary context to the LLM to answer this question accurately. In the latter case, +i.e. if there are many hundreds of Kubernetes-related tickets, then the RAG service would not be able to pass all of this context to the LLM +to answer the question accurately. A large context cannot be sent because LLMs are limited by the input token length +they are able to process. + diff --git a/docs/sql_setup.md b/docs/sql_setup.md new file mode 100644 index 0000000..3edde53 --- /dev/null +++ b/docs/sql_setup.md @@ -0,0 +1,50 @@ +# Text to SQL Setup + +## Preparation + +1. Convert structured data in CSV format to SQL DB and make it available in an S3 bucket within a prefix folder. + +2. Use synthetic questions in CSV format to train a classification model. The result of +this process is two pkl files. Also save these files in the same S3 bucket and prefix folder +as in Step 1. + + +## Setting up SQL + Vector search locally + +1. Export local variables such as: + +- LLM server URL at port 9000 +- Location of the SQL DB and question classification models + +```sh +source some_location/exports-local.sh +``` + +1. Run the LLM on k8s and port-forward: + +```sh + kubectl port-forward svc/llm-model-serve-serve-svc 9000:8000 +``` + +2. Run the local version of the SQL + hybrid search app: + +```sh +llm.rag.service % source .venv/bin/activate + uv run serverragllm_csv_to_weaviate_local.py +``` + +Wait until the application is loaded and you see this message: +```sh +INFO: Application startup complete. +``` + +3. 
Try a question about your user data: + +```sh +cd /GenAI-infra-stack/scripts/query +``` + +```sh +% python query_private_data.py +Type your query here: How many tickets are there? +``` diff --git a/docs/weaviate_local_setup.md b/docs/weaviate_local_setup.md new file mode 100644 index 0000000..17319c3 --- /dev/null +++ b/docs/weaviate_local_setup.md @@ -0,0 +1,32 @@ +# Running app locally with weaviate + +## Run weaviate +```shell + docker compose up weaviate +``` + +## Run create database +Create an .my-env file and put inside +```shell +LOCAL_DIRECTORY="../../scripts/zendesk_dataprep_output" +OUTPUT_FILENAME="" + +WEAVIATE_URI_WITH_PORT="localhost:8080" +WEAVIATE_GRPC_URI_WITH_PORT="localhost:50051" +WEAVIATE_INDEX_NAME="my_custom_index" + +EMBEDDING_CHUNK_SIZE=4000 +EMBEDDING_CHUNK_OVERLAP=100 + +EMBEDDING_MODEL_NAME=sentence-transformers/multi-qa-mpnet-base-dot-v1 +``` + +run: +```shell +uv run createvectordb.py --env_file .my-env +``` + +## Run serve rag app +```shell +uv run serverragllm_csv_to_weaviate_local.py +``` \ No newline at end of file diff --git a/question_classification/predict/question_classification.py b/question_classification/predict/question_classification.py new file mode 100644 index 0000000..e7acc72 --- /dev/null +++ b/question_classification/predict/question_classification.py @@ -0,0 +1,33 @@ +import os +import joblib + +def predict_question_type(question, model, tfidf, id_to_category): + + # Transform the input question into TF-IDF feature representation + question_tfidf = tfidf.transform([question]).toarray() + + # Predict the category ID + predicted_category_id = model.predict(question_tfidf)[0] + + # Convert category ID back to label + predicted_category = id_to_category[predicted_category_id] + + return predicted_category + +def load_models(): + # Load the saved model + rf_model_loaded = joblib.load('./models/random_forest_model.pkl') + + # Load the saved TF-IDF vectorizer + tfidf_loaded = joblib.load('./models/tfidf_vectorizer.pkl') + 
+ print("Model and vectorizer loaded successfully!") + return rf_model_loaded, tfidf_loaded + +sample_question = "How many tickets are there?" +#sample_question = "What was the last upgrade issue?" +rf_model_loaded, tfidf_loaded = load_models() +id_to_category = {0: 'aggregation', 1: 'pointed'} +predicted_category = predict_question_type(sample_question, rf_model_loaded, tfidf_loaded, id_to_category) + +print("Testing with a sample question: ", sample_question, "\nPredicted Question Type:", predicted_category) diff --git a/question_classification/predict/requirements.txt b/question_classification/predict/requirements.txt new file mode 100644 index 0000000..d5e0602 --- /dev/null +++ b/question_classification/predict/requirements.txt @@ -0,0 +1 @@ +scikit-learn diff --git a/question_classification/train/question_classification_models.py b/question_classification/train/question_classification_models.py new file mode 100644 index 0000000..4a102ab --- /dev/null +++ b/question_classification/train/question_classification_models.py @@ -0,0 +1,69 @@ +import os +import joblib +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.ensemble import RandomForestClassifier + + +def predict_question_type(question, model, tfidf, id_to_category): + # Transform the input question into TF-IDF feature representation + question_tfidf = tfidf.transform([question]).toarray() + + # Predict the category ID + predicted_category_id = model.predict(question_tfidf)[0] + + # Convert category ID back to label + predicted_category = id_to_category[predicted_category_id] + + return predicted_category + +# Save the trained model +def save_models(rf_model, tfidf): + joblib.dump(rf_model, 'random_forest_model.pkl') + + # Save the TF-IDF vectorizer as well (since it's needed for transforming new data) + joblib.dump(tfidf, 'tfidf_vectorizer.pkl') + + print("Model and vectorizer saved successfully!") + +# loading data, replace with questions specific to your 
dataset +df = pd.read_csv('syntheticquestions.csv') + +# Create a new dataframe with two columns +df1 = df[['question', 'question_type']].copy() +df1.head(3).T +df2=df1 + +# map categories to numbers +# Create a new column 'category_id' with encoded categories +df2['category_id'] = df2['question_type'].factorize()[0] +category_id_df = df2[['question_type', 'category_id']].drop_duplicates() + +# Dictionaries for future use +category_to_id = dict(category_id_df.values) +id_to_category = dict(category_id_df[['category_id', 'question_type']].values) + +print("ID to category Dict:", id_to_category) + +# New dataframe +df2.head() + +# find features and labels +tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, + ngram_range=(1, 2), + stop_words='english') + +# transform each question into a vector +features = tfidf.fit_transform(df2.question).toarray() +labels = df2.category_id +print("Each of the %d questions is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape)) + +print("Model training starts...") +rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0) +rf_model.fit(features, labels) + +sample_question = "What was the last upgrade issue?" 
+predicted_category = predict_question_type(sample_question, rf_model, tfidf, id_to_category) +print("Testing with a sample question: ", sample_question, "\nPredicted Question Type:", predicted_category) + +save_models(rf_model, tfidf) \ No newline at end of file diff --git a/question_classification/train/requirements.txt b/question_classification/train/requirements.txt new file mode 100644 index 0000000..fda4dd6 --- /dev/null +++ b/question_classification/train/requirements.txt @@ -0,0 +1,2 @@ +pandas +scikit-learn diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..addc2ae --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,13 @@ +# Prepare csv dump of Jira tickets adjusted for embedding + +[Install UV](https://docs.astral.sh/uv/getting-started/installation/), then create a venv and install deps +```shell +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt +``` +Set `jira_url` in `jira_config.ini` + +```shell +uv run process_jira_tickets.py jira_elotl.csv jira_config.ini output_files +``` diff --git a/scripts/eval/elotl_jira_questions.txt b/scripts/eval/elotl_jira_questions.txt new file mode 100644 index 0000000..05ff8d1 --- /dev/null +++ b/scripts/eval/elotl_jira_questions.txt @@ -0,0 +1,25 @@ +What is Luna? +What is Nova? +What was the description of the most recent security ticket in Luna? +Was there a Luna to enable sending user-data to an instance at boot time? +What was the description of the most recent DR ticket in Nova? +What cloud providers does Luna work on? +Can you describe any details about the addition of a promtail sidecar to Luna? +What are recent updates to the Nova JIT feature? +What are the two types of elephants in Africa? +Was there a ticket in Luna to enable sending user-data to an instance at boot time? +Was there any work done in Luna to enable sending user-data to an instance at boot time? 
+Give me the description of a ticket in Luna that deals with sending user-data to an instance at boot time? +Has the Nova JIT feature been implemented? +Can you tell me what the Nova JIT feature is? +Had a promtail sidecar been added to Luna? +What was the most recent type of testing done in Luna? +When was Nova's DR feature implemented? +Was the last ticket related to Nova's DR feature? +Which was the last ticket related to Nova's DR feature? +Is there a ticket about promtail sidecars in Luna? +Can you give me the description of the of a Luna ticket about promtail sidecars? +Can you describe any details about the addition of a promtail sidecar to Luna? +What type of work has been done to test the scalability of Nova? +What type of cloud providers does Luna not work on? +Are there any recent updates to the Nova JIT feature? diff --git a/scripts/eval/mini_rag_questions.txt b/scripts/eval/mini_rag_questions.txt new file mode 100644 index 0000000..89d049e --- /dev/null +++ b/scripts/eval/mini_rag_questions.txt @@ -0,0 +1,50 @@ +Was Abraham Lincoln the sixteenth President of the United States? +Did Lincoln sign the National Banking Act of 1863? +Did his mother die of pneumonia? +How many long was Lincoln's formal education? +When did Lincoln begin his political career? +What did The Legal Tender Act of 1862 establish? +Who suggested Lincoln grow a beard? +When did the Gettysburg address argue that America was born? +Did Lincoln beat John C. Breckinridge in the 1860 election? +Was Abraham Lincoln the first President of the United States? +When did Lincoln first serve as President? +Who assassinated Lincoln? +Did Lincoln win the election of 1860? +Who was the general in charge at the Battle of Antietam? +Why did Lincoln issue the Emancipation Proclamation? +Do scholars rank lincoln among the top three presidents? +Did lincoln have 18 months of schooling? +When was the first photgraph of lincoln taken? +How long was Lincoln's legal Career? 
+What trail did Lincoln use a Farmers' Almanac in? +Did Abraham Lincoln live in the Frontier? +Did Lincoln's Wife's Family support slavery? +Who is most noted for his contributions to the theory of molarity and molecular weight? +Who graduated in ecclesiastical law at the early age of 20 and began to practice? +Was Lorenzo Romano Amedeo Carlo Avogadro an Italian savant? +Was Amedeo Avogadro born in Turin August 9th 1776 to a noble ancient family of Piedmont, Italy? +What happened in 1833? +Who determined the dependence of the boiling of water with atmospheric pressure? +Is it true that thermometer had 100 for the freezing point? +Was Celsius born in Uppsala in Sweden? +Was Anders Celsius (November 27, 1701 April 25, 1744) a Swedish astronomer? +Is The Celsius crater on the Moon named after him? +Who was the first to perform and publish careful experiments aiming at the definition of an international temperature scale on scientific grounds ? +Can beetles be found in polar regions? +What are the three sections of a beetle? +Which defense mechanism uses colour or shape to deceive potential enemies? +Which type of beetle is a pest of potato plants? +How can beetle larvae be differentiated from other insect larvae? +What do beetles eat? +What are the similarities between beetles and grasshoppers? +How many species of beetles are there? +What is the study of beetles called? +What are prey of various animals including birds and mammals? +What was given by Aristotle for the hardened shield like forewings? +Who or what vary greatly in form within the coleoptera? +When did Coolidge meet and marry Grace Anna Goodhue? +What period of rapid economic growth did the United States experience during Coolidge's presidency? +What did Coolidge do after graduating from Amherst? +When was Coolidge born? +Where did Coolidge's grandfather had government offices? 
diff --git a/scripts/eval/test_qa.py b/scripts/eval/test_qa.py new file mode 100644 index 0000000..11f5cb9 --- /dev/null +++ b/scripts/eval/test_qa.py @@ -0,0 +1,90 @@ +import requests +import json +from datetime import datetime +import time +import sys +import os +import urllib + +def read_questions(filename): + """Read questions from a text file, one per line.""" + try: + with open(filename, 'r', encoding='utf-8') as file: + return [line.strip() for line in file if line.strip()] + except FileNotFoundError: + print(f"Error: Could not find file '{filename}'") + sys.exit(1) + +def send_question(user_message, endpoint): + """Send a single question to the API endpoint.""" + + try: + question = urllib.parse.quote(f"{user_message}") + response = requests.get(f"{endpoint}/answer/{question}") + if response.status_code == 200: + return response.json().get("answer", "Could not fetch response.") + else: + return "API Error: Unable to fetch response." + except requests.RequestException: + return "API Error: Failed to connect to the backend service." 
+ + '''try: + response = requests.post( + endpoint, + json={"question": question}, + headers={"Content-Type": "application/json"} + ) + response.raise_for_status() + return response.json().get('answer', 'No answer provided') + except requests.exceptions.RequestException as e: + return f"Error: {str(e)}" + ''' + +def save_results(results, output_filename): + """Save results to a file in a readable format.""" + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + filename = f"{output_filename}_{timestamp}.txt" + + with open(filename, 'w', encoding='utf-8') as file: + file.write("Q&A Results\n") + file.write("=" * 80 + "\n\n") + + for i, (question, answer) in enumerate(results, 1): + file.write(f"Question {i}:\n") + file.write("-" * 40 + "\n") + file.write(f"{question}\n\n") + file.write("Answer:\n") + file.write("-" * 40 + "\n") + file.write(f"{answer}\n\n") + file.write("=" * 80 + "\n\n") + + return filename + +def main(): + # Configuration + RAG_LLM_QUERY_URL = os.getenv("RAG_LLM_QUERY_URL") + INPUT_FILE = "mini_rag_questions.txt" # Replace with your questions file name + OUTPUT_FILE_PREFIX = "qa_results" + + # Read questions + print("Reading questions from file...") + questions = read_questions(INPUT_FILE) + print(f"Found {len(questions)} questions") + + # Process questions + results = [] + for i, question in enumerate(questions, 1): + print(f"Processing question {i} of {len(questions)}...") + + # Send request and get response + answer = send_question(question, RAG_LLM_QUERY_URL) + results.append((question, answer)) + + time.sleep(0.5) + + # Save results + output_file = save_results(results, OUTPUT_FILE_PREFIX) + print(f"\nResults have been saved to: {output_file}") + +if __name__ == "__main__": + main() diff --git a/scripts/jira_config.ini b/scripts/jira_config.ini new file mode 100644 index 0000000..45e1ced --- /dev/null +++ b/scripts/jira_config.ini @@ -0,0 +1,39 @@ +# Configuration file for Jira embedding preparation. 
+# Specify prefixes, substrings, list fields, and other mappings for processing Jira data. + +[PrefixFields] +# Fields with column names starting with the specified prefixes. +# Example: Columns named "Comment1", "Comment2" will be grouped into "comments_text". +comments_text = Comment +attachments_text = Attachment + +[SubstringFields] +# Fields with column names containing the specified substrings (case-insensitive). +# Example: Columns containing "issue link" in their name will be grouped into "linked_issues_text". +linked_issues_text = issue link + +[ListFields] +# Fields stored as comma-separated values or lists in a single column. +# These will be split into lists of individual items and included in composite text. +fields = Components, Labels + +[CompositeTextFields] +# Fields to include in the composite text (document body). +# Each key is a user-friendly label, and the value is the corresponding column in the DataFrame. +# NOTE: Fields from [PrefixFields], [SubstringFields], and [ListFields] will also be included in composite text. +Title = Summary +Description = Description +Status = Status +Type = Issue Type +Priority = Priority + +[MetadataFields] +# Fields to include in the document metadata. +# Each key is a metadata field name, and the value is the corresponding column in the DataFrame. 
+key = Issue key +type = Issue Type +status = Status + +[TicketUrl] +jira_url = https://elotl.atlassian.net/browse/ +metadata_field = key \ No newline at end of file diff --git a/scripts/jira_csv_local_developement.md b/scripts/jira_csv_local_developement.md new file mode 100644 index 0000000..81e2c68 --- /dev/null +++ b/scripts/jira_csv_local_developement.md @@ -0,0 +1,59 @@ +# How to run the full process for jira csv locally + +## Make sure you have +- [UV installed](https://docs.astral.sh/uv/getting-started/installation/) +- [Ollama CLI installed](https://ollama.com/download) + +## Prepare data + +Go to `GenAI-infra-stack/scripts`, create a venv, and install deps +```shell +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt +``` +Set `jira_url` in `jira_config.ini` + +```shell +uv run process_jira_tickets.py jira_elotl.csv jira_config.ini output_files +``` + +## Create vector store +Go to `GenAI-infra-stack/dockers/llm.vdb.service` + +```shell +uv run createvectordb_jira_csv_local.py ../../scripts/output_files pickled.obj +``` + +## Run rag app from local pickled.obj + +Run a local Ollama model through its OpenAI-compatible API +https://ollama.com/blog/openai-compatibility + +Go to `GenAI-infra-stack/dockers/llm.rag.service` + +```shell +export FILE_PATH="../llm.vdb.service/pickled.obj" + +uv run serverragllm_jira_cvs_local.py +``` + +## Test setup +```shell +curl "http://127.0.0.1:8000/answer/How%20to%20install%20luna%20?" +``` + +## Run UI +Go to `GenAI-infra-stack/dockers/llm.chatui.service`, create a venv, and install requirements. 
+ +```shell +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt +``` + +```shell +export RAG_LLM_QUERY_URL="http://127.0.0.1:8000" + +uv run simple_chat.py +``` \ No newline at end of file diff --git a/scripts/json_to_csv/convert_json_to_csv.py b/scripts/json_to_csv/convert_json_to_csv.py new file mode 100644 index 0000000..a318c7c --- /dev/null +++ b/scripts/json_to_csv/convert_json_to_csv.py @@ -0,0 +1,182 @@ +import json +import csv +from datetime import datetime +import io + +def clean_text(text): + """Clean text by removing extra whitespace and newlines""" + if text is None: + return "" + return ' '.join(str(text).split()).strip() + +def safe_get (obj, *keys, default = ""): + """Safely get nested dictionary values with a default if not found""" + try: + for key in keys: + if not isinstance(obj, dict): + return default + newobj = obj.get(key) + if newobj is None: + return default + return newobj + except Exception: + return default + +def format_comment(comment): + """ Format a single comment with proper escaping""" + if not comment: + return "" + try: + parts =[ + f"ID={safe_get(comment, 'id', default='N/A')}", + f"Author={safe_get(comment, 'author_id', default='N/A')}", + f"Time={safe_get(comment, 'created_at', default='N/A')}", + f"Public={safe_get(comment, 'public', default='N/A')}", + f"Body={clean_text(safe_get(comment, 'body', default=''))}" + ] + return "; ".join(parts) + except Exception as e: + print (f"Warning: Error formatting comment: {str(e)}") + return "" + + +def format_comments(comments): + """Format all comments into a single string with proper escaping """ + if not comments: + return "" + + formatted=[] + for comment in comments: + comment_str = format_comment(comment) + if comment_str: + formatted.append(comment_str) + + return " || ".join(formatted) + +def process_ticket(ticket_data): + """ Process a single ticket and return a row dictionary """ + if not ticket_data: + return None + + try: + # Process tags with proper 
escaping + tags = safe_get(ticket_data, 'tags', default =[]) + tags_str = ';'.join(str(tag) for tag in tags if tag is not None) + + return { + 'ticket_id':safe_get(ticket_data, 'id'), + 'ticket_url':safe_get(ticket_data, 'url'), + 'ticket_type':clean_text(safe_get(ticket_data, 'type')), # renaming for clarity + 'subject':clean_text(safe_get(ticket_data, 'subject')), + 'description':clean_text(safe_get(ticket_data, 'description')), + 'details': format_comments(safe_get(ticket_data, 'comments', default =[])), # renaming for clarity + 'created_at':safe_get(ticket_data, 'created_at'), + 'updated_at':safe_get(ticket_data, 'updated_at'), + 'latest_comment_added_at':safe_get(ticket_data, 'latest_comment_added_at'), + 'status':safe_get(ticket_data, 'status'), + 'priority':safe_get(ticket_data, 'priority'), + 'requester_name':safe_get(ticket_data, 'requester', 'name'), + 'requester_email':safe_get(ticket_data, 'requester', 'email'), + 'assignee_name':safe_get(ticket_data, 'assignee', 'name'), + 'assignee_email':safe_get(ticket_data, 'assignee', 'email'), + 'submitter_name':safe_get(ticket_data, 'submitter', 'name'), + 'submitter_email':safe_get(ticket_data, 'submitter', 'email'), + 'organization_name':safe_get(ticket_data, 'organization', 'name'), + 'group_name':safe_get(ticket_data, 'group', 'name'), + 'collaborator_name':safe_get(ticket_data, 'collaborator', 'name'), + 'collaborator_email':safe_get(ticket_data, 'collaborator', 'email'), + 'tags':tags_str, + 'satisfaction_rating_score':safe_get(ticket_data, 'satisfaction_rating', 'score'), + 'number_of_reopens':safe_get(ticket_data, 'metric_set', 'reopens'), + 'number_of_replies':safe_get(ticket_data, 'metric_set', 'replies'), + 'reply_time_in_minutes':safe_get(ticket_data, 'metric_set', 'reply_time_in_minutes', 'business'), + 'full_resolution_time_in_minutes':safe_get(ticket_data, 'metric_set', 'full_resolution_time_in_minutes', 'business'), + + + } + except Exception as e: + print(f"Warning: Error processing ticket: 
{str(e)}") + return None + +def convert_zendesk_json_to_csv(json_file_path, csv_file_path): + # Define CSV headers + headers=[ + 'ticket_id', + 'ticket_url', + 'ticket_type', + 'subject', + 'description', + 'details', + 'created_at', + 'updated_at', + 'latest_comment_added_at', + 'status', + 'priority', + 'requester_name', + 'requester_email', + 'assignee_name', + 'assignee_email', + 'submitter_name', + 'submitter_email', + 'collaborator_name', + 'collaborator_email', + 'organization_name', + 'group_name', + 'tags', + 'satisfaction_rating_score', + 'number_of_reopens', + 'number_of_replies', + 'reply_time_in_minutes', + 'full_resolution_time_in_minutes', + ] + # Read and process JSON file + rows =[] + skipped_tickets = 0 + total_tickets = 0 + + with open(json_file_path, 'r', encoding = 'utf-8') as file: + for line in file: + total_tickets += 1 + try: + #Try to parse each line as a separate JSON object + ticket_data = json.loads(line.strip()) + processed_ticket = process_ticket(ticket_data) + if processed_ticket: + rows.append(processed_ticket) + else: + skipped_tickets += 1 + except json.JSONDecodeError as e: + print(f"Warning: Skipping invalid JSON line: {str(e)}") + skipped_tickets += 1 + continue + + if not rows: + print("No valid tickets found in the input file") + return + + # Write to CSV with proper escaping + with open(csv_file_path, 'w', newline = '', encoding = 'utf-8') as file: + writer = csv.DictWriter(file, + fieldnames = headers, + quoting = csv.QUOTE_ALL, + #Quote all fields + escapechar = '\\', + #Use backslash as escape character + doublequote = True) + # Double-quotes within fields + writer.writeheader() + writer.writerows(rows) + print(f"CSV file has been created successfully at: {csv_file_path}") + print(f"Processed {len(rows)} tickets successfully") + print(f"Skipped {skipped_tickets} tickets due to errors") + print(f"Total tickets in input: {total_tickets}") + +if __name__ == "__main__": + + json_file_path = "zendesk_tickets.json" + 
csv_file_path = "zendesk_tickets.csv" + + try: + convert_zendesk_json_to_csv(json_file_path, csv_file_path) + except Exception as e: + print(f"An error occurred: {str(e)}") diff --git a/scripts/process_jira_tickets.py b/scripts/process_jira_tickets.py new file mode 100644 index 0000000..895ca9f --- /dev/null +++ b/scripts/process_jira_tickets.py @@ -0,0 +1,247 @@ +import os +import json +from configparser import ConfigParser +from typing import Any, Dict, List + +import click +import pandas as pd + + +def preprocess_text(text: Any) -> str: + """Clean and standardize text content.""" + if pd.isna(text): + return "" + return str(text).strip().replace("\n", " ").replace("\r", " ") + + +def get_list_from_field(value: Any) -> List[str]: + """Safely convert a field value to a list of strings.""" + if pd.isna(value): + return [] + if isinstance(value, str): + return [item.strip() for item in value.split(",") if item.strip()] + if isinstance(value, (list, tuple)): + return [str(item) for item in value if pd.notna(item)] + return [str(value)] + + +def extract_prefixed_columns( + row: pd.Series, df_columns: List[str], prefix: str +) -> List[str]: + """Extract and concatenate non-empty values from columns starting with a prefix.""" + return [ + str(row[col]) + for col in df_columns + if col.startswith(prefix) and pd.notna(row[col]) + ] + + +def extract_containing_columns( + row: pd.Series, df_columns: List[str], substring: str +) -> List[str]: + """Extract and concatenate non-empty values from columns containing a specific substring.""" + return [ + str(row[col]) + for col in df_columns + if substring in col.lower() and pd.notna(row[col]) + ] + + +def extract_composite_text( + row: pd.Series, composite_text_fields: Dict[str, str] +) -> str: + """Extract and format fields for composite text.""" + parts = [ + f"{field.lower()}: {preprocess_text(row.get(column, ''))}" + for field, column in composite_text_fields.items() + ] + return "\n".join(parts) + + +def 
extract_metadata(row: pd.Series, metadata_fields: Dict[str, str]) -> Dict[str, str]: + """Extract fields for metadata.""" + metadata = { + field.lower(): preprocess_text(row.get(column, "")) + for field, column in metadata_fields.items() + } + return metadata + + +def process_row( + row: pd.Series, + df_columns: List[str], + prefix_fields: Dict[str, str], + substring_fields: Dict[str, str], + list_fields: List[str], + composite_text_fields: Dict[str, str], + metadata_fields: Dict[str, str], + jira_url: str, + metadata_field: str, +) -> Dict[str, Any]: + """Process a single row using the provided configuration.""" + # Extract prefixed fields + prefixed_data = { + target_field.lower(): extract_prefixed_columns(row, df_columns, prefix) + for target_field, prefix in prefix_fields.items() + } + + # Extract substring fields + substring_data = { + target_field.lower(): extract_containing_columns(row, df_columns, substring) + for target_field, substring in substring_fields.items() + } + + # Extract list fields + list_data = { + field.lower(): get_list_from_field(row.get(field, "")) for field in list_fields + } + + # Extract composite text + composite_text = extract_composite_text(row, composite_text_fields) + + # Add prefixed and substring fields to composite text + for field, values in prefixed_data.items(): + if values: + composite_text += f"\n{field}: {' '.join(values)}" + for field, values in substring_data.items(): + if values: + composite_text += f"\n{field}: {' '.join(values)}" + + # Add list fields to composite text + for field, values in list_data.items(): + if values: + composite_text += f"\n{field.lower()}: {', '.join(values)}" + + # Extract metadata + metadata = extract_metadata(row, metadata_fields) + + # Add source + metadata["source"] = jira_url + metadata[metadata_field] + + return {"text": composite_text, "metadata": metadata} + + +def prepare_data_for_embedding( + df: pd.DataFrame, + prefix_fields: Dict[str, str], + substring_fields: Dict[str, str], + 
list_fields: List[str], + composite_text_fields: Dict[str, str], + metadata_fields: Dict[str, str], + jira_url: str, + metadata_field: str, +): + """Prepare documents for embedding using the provided configuration.""" + documents = [] + + for _, row in df.iterrows(): + processed_data = process_row( + row=row, + df_columns=df.columns, + prefix_fields=prefix_fields, + substring_fields=substring_fields, + list_fields=list_fields, + composite_text_fields=composite_text_fields, + metadata_fields=metadata_fields, + jira_url=jira_url, + metadata_field=metadata_field, + ) + documents.append(processed_data) + + return documents + + +def load_config(config_file): + config = ConfigParser() + config.read(config_file) + + # Parse configuration sections into variables + prefix_fields = dict(config["PrefixFields"]) if "PrefixFields" in config else {} + substring_fields = ( + dict(config["SubstringFields"]) if "SubstringFields" in config else {} + ) + + # Handle ListFields specially since we need to split the string + list_fields = [] + if "ListFields" in config and "fields" in config["ListFields"]: + fields_str = config["ListFields"]["fields"] + if fields_str: + list_fields = [f.strip() for f in fields_str.split(",")] + + composite_text_fields = ( + dict(config["CompositeTextFields"]) if "CompositeTextFields" in config else {} + ) + metadata_fields = ( + dict(config["MetadataFields"]) if "MetadataFields" in config else {} + ) + + return { + "prefix_fields": prefix_fields, + "substring_fields": substring_fields, + "list_fields": list_fields, + "composite_text_fields": composite_text_fields, + "metadata_fields": metadata_fields, + "jira_url": config["TicketUrl"]["jira_url"], + "metadata_field": config["TicketUrl"]["metadata_field"], + } + + +def save_to_json_files(data: List[Dict], output_dir: str): + """Save each dictionary in the list as a separate JSON file in the specified directory.""" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + for i, item in 
@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.argument("config_file", type=click.Path(exists=True))
@click.argument("output_dir", type=click.Path())
def process_data(input_file: str, config_file: str, output_dir: str):
    """
    Process data from INPUT_FILE using CONFIG_FILE and save to OUTPUT_DIR.

    INPUT_FILE is a CSV file containing the data to process.
    CONFIG_FILE is an INI file with the processing configuration.
    OUTPUT_DIR receives one JSON file per processed row.
    """
    # Load configuration
    click.echo(f"Loading configuration from {config_file}")
    config = load_config(config_file)

    # Load DataFrame
    click.echo(f"Reading data from {input_file}")
    df = pd.read_csv(input_file)

    # Prepare embedding data
    click.echo("Processing data...")
    embedding_data = prepare_data_for_embedding(
        df=df,
        prefix_fields=config["prefix_fields"],
        substring_fields=config["substring_fields"],
        list_fields=config["list_fields"],
        composite_text_fields=config["composite_text_fields"],
        metadata_fields=config["metadata_fields"],
        jira_url=config["jira_url"],
        metadata_field=config["metadata_field"],
    )

    # Save one JSON document per row
    click.echo(f"Saving processed data to {output_dir}")
    save_to_json_files(embedding_data, output_dir)
    click.echo(f"Processed {len(embedding_data)} documents")
@click.command()
@click.argument("input_url")
@click.argument("output_dir", type=click.Path())
def process_data(input_url: str, output_dir: str):
    """
    Process the sitemap at INPUT_URL and save one JSON file per page to OUTPUT_DIR.
    """
    sitemap_loader = SitemapLoader(
        web_path=input_url, filter_urls=["^((?!.*/v.*).)*$"]
    )
    sitemap_loader.requests_per_second = 1
    docs = sitemap_loader.load()
    print("Count of sitemap docs loaded:", len(docs))

    # click.Path() does not require the directory to exist, so create it
    # before the first write instead of failing with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    for doc in docs:
        data = {
            "text": doc.page_content,
            # source key exists in doc metadata
            "metadata": doc.metadata,
        }

        # The file name is derived from the page URL stored in the metadata.
        output_filename = build_filename_from_url(doc.metadata["source"])
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4)
def clean_text(text: Any) -> str:
    """Clean and standardize text content.

    Collapses runs of three or more dashes to a single dash, removes the
    markdown/markup characters ``#``, ``*``, ``<`` and ``>``, replaces empty
    checkbox markers (``[]`` / ``[ ]``) with a space, and normalizes all
    whitespace (including newlines) to single spaces.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` and the
            empty string produce an empty result.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    if text is None or text == "":
        return ""
    result = re.sub(r'-{3,}', '-', str(text))
    result = re.sub(r'[#*<>]+', '', result)
    # Collapse whitespace first so "[   ]" is reduced to "[ ]" before the
    # checkbox markers are replaced.
    result = re.sub(r'\s+', ' ', result)
    result = result.replace("[]", " ").replace("[ ]", " ")
    # Strip last: removing markers above can expose new leading/trailing
    # spaces (e.g. "## Title" would otherwise become " Title").
    return re.sub(r'\s+', ' ', result).strip()
def parse_list(value: Any) -> List[str]:
    """Convert a field value to a list of strings.

    Args:
        value: ``None``/empty, a comma-separated string, a list or tuple
            (whose entries may be dicts), or any other scalar.

    Returns:
        A list of non-empty strings. Comma-separated strings are split and
        trimmed; dict entries contribute their ``name``, or their ``id``
        when the name is missing or empty; falsy entries are skipped; any
        other value becomes a single-element list.
    """
    if not value:  # Handles None, empty string, empty list
        return []
    if isinstance(value, str):
        return [part.strip() for part in value.split(",") if part.strip()]
    if isinstance(value, (list, tuple)):
        parsed = []
        for item in value:
            if isinstance(item, dict):
                # Fall back to the id when the name is absent *or* empty,
                # so such entries are not silently dropped.
                item = item.get('name') or item.get('id')
            if item:
                parsed.append(str(item))
        return parsed
    return [str(value)]
def process_item(
    data: Dict[str, Any],
    keys: List[str],
    config: Dict[str, Any]
) -> Dict[str, Any]:
    """Process a single JSON item using the provided configuration.

    Args:
        data: The JSON object to process.
        keys: Top-level key names scanned for prefix and substring matches.
        config: Configuration dict with the keys ``composite_text_fields``,
            ``prefix_fields``, ``substring_fields``, ``metadata_fields``,
            ``list_fields``, ``zendesk_url`` and ``metadata_unique_id``.

    Returns:
        ``{"text": ..., "metadata": ...}``. The text is the ". "-joined
        sequence of composite, prefix, and substring parts; the metadata
        includes a ``source`` entry.

    Raises:
        KeyError: If ``config["metadata_unique_id"]`` is not a key of the
            metadata built for this item.
    """
    result_text = []

    def lookup(column: str) -> Any:
        # Dotted columns address nested values; plain ones are top-level keys.
        return get_nested_value(data, column) if '.' in column else data.get(column)

    # Composite text: title, description and ticket come first, then every
    # other field in configuration order (sorted() is stable). Names are
    # ranked case-insensitively because ConfigParser lower-cases option
    # names, so a case-sensitive match against "Title" would never succeed.
    composite_fields = config["composite_text_fields"]
    priority_rank = {"title": 0, "description": 1, "ticket": 2}
    ordered_fields = sorted(
        composite_fields,
        key=lambda name: priority_rank.get(name.lower(), len(priority_rank)),
    )
    for field in ordered_fields:
        value = lookup(composite_fields[field])
        if value:
            result_text.append(f"{field.lower()}: {clean_text(value)}")

    # Process prefix fields
    for target_field, prefix in config["prefix_fields"].items():
        if '.' in prefix:
            # Nested form "parent.child": collect child from each dict in
            # the parent list; non-dict entries are ignored.
            parent, child = prefix.split('.', 1)
            if parent in data and isinstance(data[parent], list):
                values = [
                    item[child]
                    for item in data[parent]
                    if isinstance(item, dict) and item.get(child)
                ]
                if values:
                    result_text.append(f"{target_field.lower()}: {' '.join(map(clean_text, values))}")
        else:
            values = extract_field_values(
                data, keys,
                lambda k: k.startswith(prefix)
            )
            if values:
                result_text.append(f"{target_field.lower()}: {' '.join(values)}")

    # Process substring fields
    for target_field, substring in config["substring_fields"].items():
        values = extract_field_values(
            data, keys,
            lambda k: substring in k.lower()
        )
        if values:
            result_text.append(f"{target_field.lower()}: {' '.join(values)}")

    # Build metadata
    metadata = {}
    for field, column in config["metadata_fields"].items():
        if '.' in column:
            # Nested fields are resolved one level deep, and only when the
            # parent is a dict.
            parent, child = column.split('.', 1)
            if parent in data and isinstance(data[parent], dict):
                value = data[parent].get(child, "")
            else:
                value = ""
        else:
            value = data.get(column, "")
        metadata[field.lower()] = clean_text(value)

    # List fields are stored in the metadata as a comma-separated string
    for field in config["list_fields"]:
        if field in data:
            values = parse_list(data[field])
            if values:
                metadata[field.lower()] = ", ".join(values)

    # Add source URL: base URL + unique id value + ".json"
    metadata_unique_id = config["metadata_unique_id"]
    metadata["source"] = config["zendesk_url"] + metadata[metadata_unique_id] + ".json"

    return {
        "text": ". ".join(result_text),
        "metadata": metadata
    }
data using the provided configuration and save results. + + INPUT_FILE: JSON file containing the data to process + CONFIG_FILE: INI configuration file + OUTPUT_DIR: Directory to save the processed documents + """ + # Load configuration + click.echo(f"Loading configuration from {config_file}") + config = load_config(config_file) + + # Load JSON data + click.echo(f"Reading data from {input_file}") + data = [] + with open(input_file, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: # Skip empty lines + try: + json_obj = json.loads(line) + data.append(json_obj) + except json.JSONDecodeError as e: + click.echo(f"Warning: Skipping invalid JSON line: {str(e)}") + continue + + if not data: + click.echo("Error: No valid JSON objects found in the input file") + return + + click.echo(f"Found {len(data)} JSON objects") + + # Process documents + click.echo("Processing documents...") + documents = prepare_documents(data, config) + + # Save results + click.echo(f"Saving {len(documents)} documents to {output_dir}") + save_documents(documents, output_dir) + +if __name__ == "__main__": + main() diff --git a/scripts/query/query.py b/scripts/query/query.py new file mode 100644 index 0000000..2176aa2 --- /dev/null +++ b/scripts/query/query.py @@ -0,0 +1,31 @@ +from openai import OpenAI +import os + +query = input("Type your query here: ") +modelid = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct") +modelurl = os.getenv("MODEL_URL", "http://localhost:9000/v1") + +# Note: Ray Serve doesn't support all OpenAI client arguments and may ignore some. +client = OpenAI( + # Replace the URL if deploying your app remotely + # (e.g., on Anyscale or KubeRay). 
def main():
    """Prompt for a question, send it to the answer service, and print the reply.

    The question is sent as the final path segment of a GET request to
    ``http://localhost:8000/answer/<question>``. On HTTP 200 the JSON
    body's ``answer`` value is printed; otherwise the status code and
    response text are printed.
    """
    hostname = "localhost:8000"
    query = input("Type your query here: ")

    # quote() leaves "/" unescaped by default; safe="" encodes it too so a
    # question containing a slash stays a single path segment.
    encoded_question = urllib.parse.quote(query, safe="")

    url = f"http://{hostname}/answer/{encoded_question}"

    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return

    data = json.loads(response.text.strip())
    answer = data['answer']
    print(f"Answer: {answer}")
+[MetadataFields] +ticket = id +title = subject +status = status +type = type +priority = priority +collaborator = collaborator.name +requester = requester.name +submitter = submitter.name +assignee = assignee.name +organization = organization.name +group = group.name +created_at = created_at +updated_at = updated_at +url = url + +[TicketUrl] +zendesk_url = https://zendesk.com/api/v2/tickets/ +metadata_unique_id = ticket