# -*- coding: utf-8 -*-
"""Minimal OpenAI chat-completions quickstart.

Loads credentials from a local .env file, asks gpt-3.5-turbo for a poem
about recursion, and prints the raw message object of the first choice.
"""
from dotenv import load_dotenv
from openai import OpenAI

# Pull OPENAI_API_KEY (and any other settings) from .env before the client is built.
load_dotenv()

if __name__ == "__main__":
    system_message = {
        "role": "system",
        "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
    }
    user_message = {
        "role": "user",
        "content": "Compose a poem that explains the concept of recursion in programming.",
    }

    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )

    print(completion.choices[0].message)
# -*- coding: utf-8 -*-
"""Create OpenAI embeddings for the text content of a PDF file."""
from dotenv import load_dotenv

# Load credentials before any OpenAI client is constructed.
load_dotenv()

import numpy as np
import openai
from pypdf import PdfReader


class EmbeddingService:
    """Read a PDF, split it into fixed-size text chunks and embed them.

    Workflow: ``read_pdf()`` fills ``parsed_chunks``; ``get_embeddings()``
    then fills ``embeddings`` with the raw OpenAI embeddings response.
    """

    def __init__(self, pdf_path: str):
        self.openai_client = openai.OpenAI()
        self.pdf_path = pdf_path
        # Populated lazily by read_pdf() and get_embeddings() respectively.
        self.parsed_chunks = None
        self.embeddings = None

    def read_pdf(self, chunk_length: int):
        """Split every page of the PDF into chunks of ``chunk_length`` characters.

        Newlines are replaced by spaces so each chunk is a single line of text.
        """
        reader = PdfReader(self.pdf_path)
        parsed = []
        for page in reader.pages:
            page_text = page.extract_text()
            for start in range(0, len(page_text), chunk_length):
                parsed.append(
                    page_text[start : start + chunk_length].replace("\n", " ")
                )
        self.parsed_chunks = parsed

    def get_embeddings(self, model="text-embedding-ada-002"):
        """Request embeddings for the parsed chunks and store the raw response."""
        # The embeddings endpoint expects a list; wrap a single chunk if needed.
        if isinstance(self.parsed_chunks, list):
            chunks = self.parsed_chunks
        else:
            chunks = [self.parsed_chunks]
        self.embeddings = self.openai_client.embeddings.create(
            input=chunks, model=model
        )

    def test_openai_client(self):
        """Smoke-test the chat endpoint by printing one sample completion."""
        completion = self.openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
                },
                {
                    "role": "user",
                    "content": "Compose a poem that explains the concept of recursion in programming.",
                },
            ],
        )

        print(completion.choices[0].message)


if __name__ == "__main__":
    embed_service = EmbeddingService(pdf_path="data/pdf-example.pdf")
    embed_service.read_pdf(1000)
    print(embed_service.parsed_chunks[-1])
    embed_service.get_embeddings()
    embeddings = embed_service.embeddings
    parsed_chunks = embed_service.parsed_chunks
    print(
        f"We should have {len(embeddings.data)} different vector embeddings for {len(parsed_chunks)} different parsed chunks."
    )
# -*- coding: utf-8 -*-
# --- scripts/question_answering_on_pdf/dataservice.py ---
"""Store PDF-chunk embeddings in a RediSearch vector index and query them."""
import numpy as np
import openai
from pypdf import PdfReader
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

import redis

INDEX_NAME = "embeddings-index"  # name of the search index
PREFIX = "doc"  # prefix for the document keys
# distance metric for the vectors (ex. COSINE, IP, L2)
DISTANCE_METRIC = "COSINE"

REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = ""


class DataService:
    """Thin wrapper around a Redis connection for embedding storage/search."""

    def __init__(self):
        # Connect to Redis
        self.redis_client = redis.Redis(
            host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD
        )

    def drop_redis_data(self, index_name: str = INDEX_NAME):
        """Drop the search index if it exists; a missing index is not an error."""
        try:
            self.redis_client.ft(index_name).dropindex()
            print('Index dropped')
        except redis.exceptions.ResponseError:
            # Index does not exist — narrowed from a bare `except:` so real
            # connection errors are no longer silently swallowed.
            print('Index does not exist')

    def load_data_to_redis(self, embeddings):
        """Create the vector index (if absent) and HSET every embedding record.

        ``embeddings`` is a list of dicts with keys ``id``, ``vector``, ``text``
        as produced by :meth:`pdf_to_embeddings`.
        """
        vector_dim = len(embeddings[0]['vector'])  # length of the vectors
        vector_number = len(embeddings)  # initial number of vectors

        # Define RediSearch fields: plain text plus a FLAT float32 vector field.
        text = TextField(name="text")
        text_embedding = VectorField(
            "vector",
            "FLAT",
            {
                "TYPE": "FLOAT32",
                "DIM": vector_dim,
                "DISTANCE_METRIC": "COSINE",
                "INITIAL_CAP": vector_number,
            },
        )
        fields = [text, text_embedding]

        # Create the index only when it does not already exist; ft().info()
        # raises ResponseError for an unknown index.
        try:
            self.redis_client.ft(INDEX_NAME).info()
            print("Index already exists")
        except redis.exceptions.ResponseError:
            self.redis_client.ft(INDEX_NAME).create_index(
                fields=fields,
                definition=IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH),
            )

        for embedding in embeddings:
            key = f"{PREFIX}:{str(embedding['id'])}"
            # Redis hashes store bytes; serialize the vector as packed float32.
            embedding["vector"] = np.array(
                embedding["vector"], dtype=np.float32
            ).tobytes()
            self.redis_client.hset(key, mapping=embedding)
        print(
            f"Loaded {self.redis_client.info()['db0']['keys']} documents in Redis search index with name: {INDEX_NAME}")

    def pdf_to_embeddings(self, pdf_path: str, chunk_length: int = 1000):
        """Read a PDF, chunk its text and return embedding records.

        Returns a list of ``{'id': ..., 'vector': ..., 'text': ...}`` dicts.
        """
        # Read data from pdf file and split it into chunks
        reader = PdfReader(pdf_path)
        chunks = []
        for page in reader.pages:
            text_page = page.extract_text()
            chunks.extend(
                text_page[i:i + chunk_length].replace('\n', '')
                for i in range(0, len(text_page), chunk_length)
            )

        # NOTE(review): openai.Embedding is the pre-1.0 SDK API, but
        # pyproject.toml pins `openai = ">1"`, whose client removed it —
        # this call will raise at runtime under openai>=1. Migrate to
        # client.embeddings.create(...) or relax the pin; confirm intent.
        response = openai.Embedding.create(
            model='text-embedding-ada-002', input=chunks)
        return [{'id': value['index'], 'vector': value['embedding'], 'text': chunks[value['index']]} for value in response['data']]

    def search_redis(self,
                     user_query: str,
                     index_name: str = "embeddings-index",
                     vector_field: str = "vector",
                     return_fields: list = None,
                     hybrid_fields="*",
                     k: int = 5,
                     print_results: bool = False,
                     ):
        """KNN-search the index with the embedded user query; return the texts."""
        # Avoid a shared mutable default argument; behavior is unchanged.
        if return_fields is None:
            return_fields = ["text", "vector_score"]
        # Creates embedding vector from user query (legacy API — see NOTE above).
        embedded_query = openai.Embedding.create(input=user_query,
                                                 model="text-embedding-ada-002",
                                                 )["data"][0]['embedding']
        # Prepare the KNN query: top-k nearest vectors, distance as vector_score.
        base_query = f'{hybrid_fields}=>[KNN {k} @{vector_field} $vector AS vector_score]'
        query = (
            Query(base_query)
            .return_fields(*return_fields)
            .sort_by("vector_score")
            .paging(0, k)
            .dialect(2)
        )
        params_dict = {"vector": np.array(
            embedded_query).astype(dtype=np.float32).tobytes()}
        # Perform the vector search.
        results = self.redis_client.ft(index_name).search(query, params_dict)
        if print_results:
            for i, doc in enumerate(results.docs):
                # Cosine distance -> similarity score.
                score = 1 - float(doc.vector_score)
                print(f"{i}. {doc.text} (Score: {round(score, 3)})")
        return [doc['text'] for doc in results.docs]


# --- scripts/question_answering_on_pdf/intentservice.py ---


class IntentService:
    """Extracts search keywords ("intent") from a user question via the LLM."""

    def __init__(self):
        pass

    def get_intent(self, user_question: str):
        """Return only the keywords of ``user_question`` as a string."""
        # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK API and is
        # removed in openai>=1 (the pinned version) — confirm/migrate.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f'Extract the keywords from the following question: {user_question}' +
                 'Do not answer anything else, only the keywords.'}
            ]
        )

        # Extract the assistant's reply text.
        return (response['choices'][0]['message']['content'])
# -*- coding: utf-8 -*-
# --- scripts/question_answering_on_pdf/responseservice.py ---
import openai


class ResponseService:
    """Generates the final answer to a question from retrieved facts."""

    def __init__(self):
        pass

    def generate_response(self, facts, user_question):
        """Ask the chat model to answer ``user_question`` using ``facts``."""
        # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK API; the
        # project pins openai>1, where it was removed — confirm/migrate.
        prompt = (
            'Based on the FACTS, give an answer to the QUESTION.'
            + f'QUESTION: {user_question}. FACTS: {facts}'
        )
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
        )

        # Extract the assistant's reply text.
        return (response['choices'][0]['message']['content'])


# --- scripts/question_answering_on_pdf/run.py ---
"""End-to-end demo: index a PDF in Redis, then answer a question about it."""
from dotenv import load_dotenv

load_dotenv()

from scripts.question_answering_on_pdf.intentservice import IntentService
from scripts.question_answering_on_pdf.responseservice import ResponseService
from scripts.question_answering_on_pdf.dataservice import DataService

# Example pdf
pdf = 'data/pdf-example.pdf'

data_service = DataService()

# Start from a clean index, then load the PDF's embeddings into Redis.
data_service.drop_redis_data()
data = data_service.pdf_to_embeddings(pdf)
data_service.load_data_to_redis(data)

intent_service = IntentService()
response_service = ResponseService()

# Question -> keywords -> retrieved facts -> generated answer.
question = "Can you explain to me the summary of the paper?"
intents = intent_service.get_intent(question)
facts = data_service.search_redis(intents)
answer = response_service.generate_response(facts, question)
print(answer)
# -*- coding: utf-8 -*-
# --- ai_teacher/exam_generator/exam_creator.py ---
"""Generate a multiple-choice exam (as JSON) from document content."""
import os

from dotenv import load_dotenv

load_dotenv()

from openai import OpenAI


class ExamCreator:
    """Wraps the OpenAI chat API to produce a JSON multiple-choice exam."""

    def __init__(self):
        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    def generate_questions(self, content):
        """Generates multiple-choice questions based on the provided content.

        Returns the model's JSON string, or None if the API call fails.
        """
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                response_format={"type": "json_object"},
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant designed to output a multiple choice exam as a JSON.",
                    },
                    {"role": "user", "content": self._create_prompt(content)},
                ],
            )
            return response.choices[0].message.content
        # Broad catch is deliberate here: any SDK/network failure is reported
        # and the caller receives None instead of a crash.
        except Exception as e:
            print(f"Error in generating questions: {e}")
            return None

    @staticmethod
    def _create_prompt(content):
        """Creates a prompt for the OpenAI API based on the content.

        Fix: the prompt promised "content between the content tags" but the
        original never wrapped the content in any tags — add them so the
        instruction matches the payload.
        """
        prompt = (
            "Create 5 multiple-choice questions based on content between the content tags: "
            f"\n\n<content>\n{content}\n</content>\n\n"
        )
        return prompt


# Test the exam creator (you can remove this test in your actual application)
if __name__ == "__main__":
    exam_creator = ExamCreator()
    sample_content = "Your sample text content here."
    questions = exam_creator.generate_questions(sample_content)
    print(questions)


# --- ai_teacher/file_processor/docx_processor.py ---
import docx


class DOCXProcessor:
    @staticmethod
    def process_docx(file_path):
        """Processes the Word file and extracts its content.

        Returns all paragraph texts joined by single spaces.
        """
        doc = docx.Document(file_path)
        content = [paragraph.text for paragraph in doc.paragraphs]
        return " ".join(content)


# Test the processor (you can remove this test in your actual application)
if __name__ == "__main__":
    docx_content = DOCXProcessor.process_docx("path_to_your_docx.docx")
    print(docx_content)


# --- ai_teacher/file_processor/pdf_processor.py ---
import PyPDF2


class PDFProcessor:
    @staticmethod
    def process_pdf(file_path):
        """Processes the PDF file and extracts its content.

        NOTE(review): PdfFileReader/numPages/getPage/extractText are the
        PyPDF2 1.x API, consistent with the `PyPDF2 = "1.28.6"` pin in
        pyproject.toml; they were removed in PyPDF2>=3 — keep the pin.
        """
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfFileReader(file)
            content = []
            for page in range(pdf_reader.numPages):
                content.append(pdf_reader.getPage(page).extractText())
            return " ".join(content)


# Test the processor (you can remove this test in your actual application)
if __name__ == "__main__":
    pdf_content = PDFProcessor.process_pdf("path_to_your_pdf.pdf")
    print(pdf_content)


# --- ai_teacher/file_processor/ppt_processor.py ---
from pptx import Presentation


class PPTProcessor:
    @staticmethod
    def process_ppt(file_path):
        """Processes the PowerPoint file and extracts text from each slide.

        Only shapes that expose a ``text`` attribute contribute content.
        """
        prs = Presentation(file_path)
        content = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    content.append(shape.text)
        return " ".join(content)


# Test the processor (you can remove this test in your actual application)
if __name__ == "__main__":
    ppt_content = PPTProcessor.process_ppt("path_to_your_ppt.pptx")
    print(ppt_content)
# -*- coding: utf-8 -*-
# --- ai_teacher/main.py ---
"""CLI entry point: upload a document, generate an exam, export it as PDF."""
from exam_generator.exam_creator import ExamCreator
from file_processor.docx_processor import DOCXProcessor
from file_processor.pdf_processor import PDFProcessor
from file_processor.ppt_processor import PPTProcessor
from pdf_creator.pdf_exporter import PDFExporter
from utils.file_uploader import FileUploader


def process_file(file_path):
    """Determines the file type and processes it using the appropriate processor.

    Returns the extracted text, or None for unsupported extensions.
    """
    if file_path.endswith(".pdf"):
        return PDFProcessor.process_pdf(file_path)
    elif file_path.endswith(".pptx"):
        return PPTProcessor.process_ppt(file_path)
    elif file_path.endswith(".docx"):
        return DOCXProcessor.process_docx(file_path)
    else:
        print("Unsupported file format.")
        return None


def main():
    """Run the full pipeline: upload -> extract -> generate -> export PDF."""
    print("Welcome to the Exam Generator!")
    file_path = FileUploader.upload_file()
    content = process_file(file_path)

    if content:
        print("File processed successfully.")
        exam_creator = ExamCreator()
        exam_questions = exam_creator.generate_questions(content)
        if exam_questions:
            print("Exam generated successfully.")
            pdf_exporter = PDFExporter()
            pdf_exporter.create_pdf(exam_questions, "generated_exam.pdf")
            print("Exam exported as PDF successfully.")
        else:
            print("Failed to generate exam.")
    else:
        print("Failed to process the file.")


if __name__ == "__main__":
    main()


# --- ai_teacher/pdf_creator/pdf_exporter.py ---
from fpdf import FPDF


class PDFExporter:
    """Exports plain-text content to a PDF file."""

    def __init__(self):
        self.pdf = FPDF()

    def create_pdf(self, content, file_name="exam.pdf"):
        """Creates a PDF file from the provided content.

        Fix: the original reused one FPDF instance across calls, so a second
        export accumulated the first export's pages into the new file. A
        fresh document is now built per call (still stored on ``self.pdf``
        for backward compatibility).
        """
        self.pdf = FPDF()
        self.pdf.add_page()
        self.pdf.set_font("Arial", size=12)
        self.pdf.multi_cell(0, 10, content)
        self.pdf.output(file_name)


# Test the PDF exporter (you can remove this test in your actual application)
if __name__ == "__main__":
    exporter = PDFExporter()
    sample_content = "Your sample exam content here."
    exporter.create_pdf(sample_content, "sample_exam.pdf")


# --- ai_teacher/utils/file_uploader.py ---
import os


class FileUploader:
    @staticmethod
    def upload_file():
        """Prompts the user to enter the file path and checks if the file exists.

        Returns the path if the file exists, otherwise prompts again.
        """
        while True:
            file_path = input("Enter the path of your file: ")
            if os.path.isfile(file_path):
                return file_path
            else:
                print("File not found. Please try again.")


# Test the uploader (you can remove this test in your actual application)
if __name__ == "__main__":
    uploader = FileUploader()
    file_path = uploader.upload_file()
    print(f"File {file_path} uploaded successfully.")