batch_score_documents.py

import os

import pandas as pd
import torch
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize  # requires the NLTK "punkt" models: nltk.download("punkt")
from transformers import AutoModel, AutoTokenizer

# Paths for the raw summaries, the sentence-chunked CSV, and the scored parquet.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "feature_repo"))
DATA_DIR = os.path.join(BASE_DIR, "data")
INPUT_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries.csv")
CHUNKED_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries_chunked.csv")
EXPORT_FILENAME = os.path.join(
    DATA_DIR, "city_wikipedia_summaries_with_embeddings.parquet"
)

# The same sentence-transformers checkpoint supplies both the tokenizer and the model.
TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def mean_pooling(model_output, attention_mask):
    """Average token embeddings, using the attention mask to ignore padding."""
    # First element of model_output contains all token embeddings.
    token_embeddings = model_output[0]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def run_model(sentences, tokenizer, model):
    """Embed a list of sentences and L2-normalize the resulting vectors."""
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    )
    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings
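

# Illustrative example (not part of the original script; assumes the model weights
# can be downloaded): run_model() returns one unit-norm row per input sentence,
# so cosine similarity between any two rows is just a dot product.
#
#     tok = AutoTokenizer.from_pretrained(TOKENIZER)
#     mdl = AutoModel.from_pretrained(MODEL)
#     emb = run_model(["Seattle is a city.", "Boston is a city."], tok, mdl)
#     emb.shape        # torch.Size([2, 384]) for all-MiniLM-L6-v2
#     emb.norm(dim=1)  # tensor([1.0000, 1.0000])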


def score_data() -> None:
    """Chunk the summaries, embed each sentence, and export a scored parquet."""
    os.makedirs(DATA_DIR, exist_ok=True)
    if not os.path.exists(EXPORT_FILENAME):
        print("Scored data not found... generating embeddings...")
        if not os.path.exists(CHUNKED_FILENAME):
            print("Chunked data not found... generating chunked data...")
            df = pd.read_csv(INPUT_FILENAME)
            # Split each summary into sentences, then explode to one row per sentence.
            # ignore_index keeps row ids unique, matching the reloaded-CSV path below.
            df["Sentence Chunks"] = df["Wiki Summary"].apply(sent_tokenize)
            df = df.explode("Sentence Chunks", ignore_index=True)
            df.to_csv(CHUNKED_FILENAME, index=False)
        else:
            df = pd.read_csv(CHUNKED_FILENAME)

        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
        model = AutoModel.from_pretrained(MODEL)
        embeddings = run_model(df["Sentence Chunks"].tolist(), tokenizer, model)
        print("embeddings generated...")
        df["id"] = range(len(df))
        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
        df["event_timestamp"] = pd.to_datetime("today")
        df["item_id"] = df.index
        df = _rename_df(df)
        df.to_parquet(EXPORT_FILENAME, index=False)
        print("...data exported. Job complete")
    else:
        df = pd.read_parquet(EXPORT_FILENAME)
        print("Scored data found... skipping generating embeddings.")
    print("preview of data:")
    print(df.head().T)


def _rename_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names to snake_case and select the export schema."""
    df.columns = [c.replace(" ", "_").lower() for c in df.columns]
    df = df.rename({"embeddings": "vector"}, axis=1)
    return df[
        [
            "id",
            "item_id",
            "event_timestamp",
            "state",
            "wiki_summary",
            "sentence_chunks",
            "vector",
        ]
    ]
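

# Illustrative addition (not in the original script): a minimal retrieval sketch
# over the exported parquet. Because run_model() L2-normalizes its output, cosine
# similarity reduces to a dot product. The query string and top_k default are
# assumptions for demonstration only.
def demo_query(query: str, top_k: int = 3) -> pd.DataFrame:
    import numpy as np

    df = pd.read_parquet(EXPORT_FILENAME)
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
    model = AutoModel.from_pretrained(MODEL)
    # Embed the query with the same model used for the documents.
    query_vec = run_model([query], tokenizer, model)[0].numpy()
    scores = np.stack(df["vector"].to_numpy()) @ query_vec
    # Highest-scoring sentence chunks first.
    return df.iloc[scores.argsort()[::-1][:top_k]][["state", "sentence_chunks"]]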


if __name__ == "__main__":
    score_data()