forked from mistralai/client-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr_process_from_file.py
More file actions
44 lines (34 loc) · 1.09 KB
/
ocr_process_from_file.py
File metadata and controls
44 lines (34 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from mistralai import Mistral
import os
import json
from pathlib import Path
import urllib.request
# Local filename the sample PDF is stored under while the script runs.
MOE_FILENAME: str = "mixtral_of_experts.pdf"

# Remote location the sample PDF is fetched from when it is not on disk.
MIXTRAL_OF_EXPERTS_PDF_URL: str = "https://arxiv.org/pdf/2401.04088"
def main():
    """Run OCR on the sample PDF and print the result as indented JSON.

    Steps:
      1. Read the API key from the ``MISTRAL_API_KEY`` environment variable
         and build a ``Mistral`` client.
      2. Download the PDF from ``MIXTRAL_OF_EXPERTS_PDF_URL`` to
         ``MOE_FILENAME`` unless a file with that name already exists.
      3. Upload the file with purpose ``"ocr"`` and process it with the
         ``mistral-ocr-latest`` model (base64 images included).
      4. Print the response, then remove the local PDF.

    The local PDF is removed even when one of the steps fails, so a
    truncated download is never picked up by a later run.

    Raises:
        KeyError: If ``MISTRAL_API_KEY`` is not set in the environment.
    """
    api_key = os.environ["MISTRAL_API_KEY"]
    client = Mistral(api_key=api_key)

    pdf_file = Path(MOE_FILENAME)
    try:
        # Download the file if it doesn't exist
        if not pdf_file.is_file():
            urllib.request.urlretrieve(MIXTRAL_OF_EXPERTS_PDF_URL, MOE_FILENAME)

        # Upload the file
        uploaded_file = client.files.upload(
            file={
                # NOTE(review): ``stem`` drops the ".pdf" suffix; confirm the
                # API does not rely on the extension (``pdf_file.name`` keeps it).
                "file_name": pdf_file.stem,
                "content": pdf_file.read_bytes(),
            },
            purpose="ocr",
        )

        pdf_response = client.ocr.process(
            document={
                "type": "file",
                "file_id": uploaded_file.id,
            },
            model="mistral-ocr-latest",
            include_image_base64=True,
        )

        # Print the parsed PDF
        response_dict = json.loads(pdf_response.model_dump_json())
        json_string = json.dumps(response_dict, indent=4)
        print(json_string)
    finally:
        # Remove the file. Done in ``finally`` so a failed download, upload or
        # OCR call does not leave the PDF behind; ``missing_ok`` keeps the
        # cleanup from masking the original error when nothing was written.
        pdf_file.unlink(missing_ok=True)
# Script entry point: run the example only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()