
Commit e8c3882

uploading rag demo

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>

1 parent 443c130

File tree

9 files changed: +415 −0 lines

module_4_rag/Dockerfile

Lines changed: 37 additions & 0 deletions

```dockerfile
FROM python:3.9

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

# Set work directory
WORKDIR /code

# Install dependencies
ENV LIBMEMCACHED=/opt/local
RUN apt-get update && apt-get install -y \
    libmemcached11 \
    libmemcachedutil2 \
    libmemcached-dev \
    libz-dev \
    curl \
    gettext

ENV PYTHONHASHSEED=random \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    # Poetry's configuration: \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR='/var/cache/pypoetry' \
    POETRY_HOME='/usr/local' \
    POETRY_VERSION=1.4.1

RUN curl -sSL https://install.python-poetry.org | python3 - --version $POETRY_VERSION

COPY pyproject.toml poetry.lock /code/
RUN poetry install --no-interaction --no-ansi --no-root

COPY . /code/
```

module_4_rag/README.md

Lines changed: 64 additions & 0 deletions
This is a demo showing how you can use Feast for Retrieval Augmented Generation (RAG).

## Installation via PyEnv and Poetry

This demo assumes you have Pyenv (2.3.10), Poetry (1.4.1), and Python 3.9 installed on your machine.

```bash
pyenv local 3.9
poetry shell
poetry install
```

## Setting up the data and Feast

To fetch the data, simply run

```bash
python pull_states.py
```

which will output a file called `city_wikipedia_summaries.csv`.

Then run

```bash
python batch_score_documents.py
```

# Overview

The goal is to define an architecture that supports the following:

```mermaid
flowchart TD;
    A[Pull Data] --> B[Batch Score Embeddings];
    B[Batch Score Embeddings] --> C[Materialize Online];
    C[Materialize Online] --> D[Retrieval Augmented Generation];
    D[Retrieval Augmented Generation] --> E[Store User Interaction];
    E[Store User Interaction] --> F[Update Training Labels];
    F[Update Training Labels] --> H[Fine Tuning];
    H[Fine Tuning] -. Backpropagate .-> B[Batch Score Embeddings];
```
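The Retrieval Augmented Generation step in the flow above boils down to a nearest-neighbor lookup over the batch-scored embeddings. A minimal sketch with toy vectors (the real demo uses 384-dimensional `all-MiniLM-L6-v2` embeddings; `retrieve_top_k` is a hypothetical helper, not part of this commit):

```python
import numpy as np

def retrieve_top_k(query_emb, doc_embs, k=2):
    # The embeddings are L2-normalized, so cosine similarity
    # reduces to a plain dot product.
    scores = doc_embs @ query_emb
    return np.argsort(scores)[::-1][:k]

# Toy example: three normalized 4-dim "document" embeddings.
docs = np.array([
    [1.0, 0.0, 0.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.6, 0.8, 0.0, 0.0],
])
query = np.array([0.8, 0.6, 0.0, 0.0])
top = retrieve_top_k(query, docs, k=2)
print(top)  # indices of the two closest summaries
```

The retrieved summaries are then passed to the LLM as context for generation.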
A simple example of the user experience:

```
Q: Can you tell me about Chicago?
A: Here's some wikipedia facts about Chicago...
```

# Limitations

A common issue with RAG and LLMs is hallucination. There are two common approaches to mitigating it:

1. Prompt engineering
   - This approach is the most obvious but is susceptible to prompt injection.

2. Building a classifier to return the "I don't know" response
   - This approach is less obvious and requires another model, more training data, and fine-tuning.

We can, in fact, use both approaches together to further reduce the likelihood of prompt injection.

This demo will demonstrate both.
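The second approach can be sketched as a gate in front of generation. A minimal sketch using a hypothetical confidence threshold in place of a trained classifier (`answer_or_abstain` and the `0.5` threshold are illustrative, not part of this commit; a real implementation would train on labeled retrievals):

```python
def answer_or_abstain(retrieval_score, threshold=0.5):
    # Gate the generation step: if the best retrieved document is
    # not similar enough to the question, return the "I don't know"
    # response instead of passing weak context to the LLM.
    if retrieval_score < threshold:
        return "I don't know."
    return "generate answer from retrieved context"

print(answer_or_abstain(0.2))  # abstains
print(answer_or_abstain(0.9))  # proceeds to generation
```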

module_4_rag/app.py

Lines changed: 99 additions & 0 deletions
```python
from flask import (
    Flask,
    jsonify,
    request,
    render_template,
)
from flasgger import Swagger
from datetime import datetime
from get_features import (
    get_onboarding_features,
    get_onboarding_score,
    get_daily_features,
    get_daily_score,
)
from ml import make_risk_decision

app = Flask(__name__)
swagger = Swagger(app)


@app.route("/")
def onboarding_page():
    return render_template("index.html")


@app.route("/home")
def home_page():
    return render_template("home.html")


@app.route("/onboarding-risk-features/", methods=["POST"])
def onboarding_features():
    """Example endpoint returning features by id
    This is using docstrings for specifications.
    ---
    parameters:
      - name: state
        type: string
        in: query
        required: true
        default: NJ

      - name: ssn
        type: string
        in: query
        required: true
        default: 123-45-6789

      - name: dl
        type: string
        in: query
        required: true
        default: some-dl-number

      - name: dob
        type: string
        in: query
        required: true
        default: 12-23-2000
    responses:
      200:
        description: A JSON of features
        schema:
          id: OnboardingFeatures
          properties:
            is_gt_18_years_old:
              type: array
              items:
                schema:
                  id: value
                  type: number
            is_valid_state:
              type: array
              items:
                schema:
                  id: value
                  type: number
            is_previously_seen_ssn:
              type: array
              items:
                schema:
                  id: value
                  type: number
            is_previously_seen_dl:
              type: array
              items:
                schema:
                  id: value
                  type: number
    """
    r = request.args
    feature_vector = get_onboarding_features(
        r.get("state"), r.get("ssn"), r.get("dl"), r.get("dob")
    )
    return jsonify(feature_vector)


if __name__ == "__main__":
    app.run(debug=True)
```
module_4_rag/batch_score_documents.py

Lines changed: 45 additions & 0 deletions

```python
import os

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

INPUT_FILENAME = "city_wikipedia_summaries.csv"
EXPORT_FILENAME = "city_wikipedia_summaries_with_embeddings.csv"
TOKENIZER = 'sentence-transformers/all-MiniLM-L6-v2'
MODEL = 'sentence-transformers/all-MiniLM-L6-v2'


def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def run_model(sentences, tokenizer, model):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings


def score_data() -> None:
    if EXPORT_FILENAME not in os.listdir():
        print("scored data not found...generating embeddings...")
        df = pd.read_csv(INPUT_FILENAME)
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
        model = AutoModel.from_pretrained(MODEL)
        embeddings = run_model(df['Wiki Summary'].tolist(), tokenizer, model)
        print(embeddings)
        print('shape = ', df.shape)
        df['Embeddings'] = list(embeddings.detach().cpu().numpy())
        print("embeddings generated...")
        print(df.head())
        df.to_csv(EXPORT_FILENAME, index=False)
        print("...data exported. job complete")
    else:
        print("scored data found...skipping generating embeddings.")


if __name__ == '__main__':
    score_data()
```
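The `mean_pooling` step above averages token embeddings while ignoring padding. The same arithmetic can be illustrated with plain NumPy on toy shapes (hypothetical values; the real code operates on transformer outputs):

```python
import numpy as np

# 1 sentence, 3 tokens, 2-dim token embeddings; last token is padding.
token_embeddings = np.array([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
attention_mask = np.array([[1, 1, 0]])  # padding token masked out

mask = attention_mask[..., None]           # shape (1, 3, 1), broadcasts over dims
summed = (token_embeddings * mask).sum(1)  # sum real tokens only
counts = np.clip(mask.sum(1), 1e-9, None)  # avoid division by zero
sentence_embedding = summed / counts       # average of the unmasked tokens
print(sentence_embedding)  # [[2.0, 3.0]]
```

The padding token's [9.0, 9.0] contributes nothing, so the result is the mean of the two real tokens.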

module_4_rag/docker-compose.yml

Lines changed: 14 additions & 0 deletions
```yaml
version: '3.9'

services:
  web:
    env_file:
      - .env
    build: .
    command:
      - /bin/bash
      - -c
      - python3 /code/run.py

    volumes:
      - .:/code
```
Lines changed: 40 additions & 0 deletions

```python
import csv
import random

topics = ["science", "history", "technology", "mathematics", "geography", "literature", "sports", "art", "music", "cinema"]

# Define a pattern for generating questions
question_patterns = [
    "What are the key principles of {}?",
    "Who are the most influential figures in {}?",
    "How has {} evolved over the years?",
    "What are some common misconceptions about {}?",
    "Can you explain the theory of {}?",
    "What role does {} play in modern society?",
    "How does {} affect our daily lives?",
    "What are the future prospects of {}?",
    "What are the major challenges in {} today?",
    "How can one get started with {}?"
]

# Generate a list of 50 random questions
questions = []
for _ in range(50):
    topic = random.choice(topics)
    pattern = random.choice(question_patterns)
    question = pattern.format(topic)
    questions.append([question])


def main():
    # Define the file path
    file_path = './random_questions.csv'

    # Write the questions to a CSV file
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Question"])  # Writing header
        writer.writerows(questions)


if __name__ == "__main__":
    main()
```

module_4_rag/pull_states.py

Lines changed: 85 additions & 0 deletions
```python
import os
from typing import Dict, List

import wikipedia as wiki
import pandas as pd

EXPORT_FILENAME = "city_wikipedia_summaries.csv"
CITIES = [
    "New York, New York",
    "Los Angeles, California",
    "Chicago, Illinois",
    "Houston, Texas",
    "Phoenix, Arizona",
    "Philadelphia, Pennsylvania",
    "San Antonio, Texas",
    "San Diego, California",
    "Dallas, Texas",
    "San Jose, California",
    "Austin, Texas",
    "Jacksonville, Florida",
    "Fort Worth, Texas",
    "Columbus, Ohio",
    "Charlotte, North Carolina",
    "San Francisco, California",
    "Indianapolis, Indiana",
    "Seattle, Washington",
    "Denver, Colorado",
    "Washington, D.C.",
    "Boston, Massachusetts",
    "El Paso, Texas",
    "Nashville, Tennessee",
    "Detroit, Michigan",
    "Oklahoma City, Oklahoma",
    "Portland, Oregon",
    "Las Vegas, Nevada",
    "Memphis, Tennessee",
    "Louisville, Kentucky",
    "Baltimore, Maryland",
    "Milwaukee, Wisconsin",
    "Albuquerque, New Mexico",
    "Tucson, Arizona",
    "Fresno, California",
    "Mesa, Arizona",
    "Sacramento, California",
    "Atlanta, Georgia",
    "Kansas City, Missouri",
    "Colorado Springs, Colorado",
    "Miami, Florida",
    "Raleigh, North Carolina",
    "Omaha, Nebraska",
    "Long Beach, California",
    "Virginia Beach, Virginia",
    "Oakland, California",
    "Minneapolis, Minnesota",
    "Tulsa, Oklahoma",
    "Arlington, Texas",
    "Tampa, Florida",
    "New Orleans, Louisiana"
]


def get_wikipedia_summary(cities: List[str]) -> Dict[str, str]:
    city_summaries = {}
    for city in cities:
        try:
            city_summaries[city] = wiki.summary(city)
        except Exception:
            print(f"error retrieving {city}")

    return city_summaries


def write_data(output_dict: Dict[str, str]) -> None:
    df = pd.DataFrame([output_dict]).T.reset_index()
    df.columns = ['State', 'Wiki Summary']
    df.to_csv(EXPORT_FILENAME, index=False)


def pull_state_data() -> None:
    if EXPORT_FILENAME not in os.listdir():
        print("data not found...pulling wikipedia summaries...")
        city_summary_output = get_wikipedia_summary(CITIES)
        write_data(city_summary_output)
    else:
        print("data already present...skipping download")


if __name__ == "__main__":
    pull_state_data()
```
