forked from Unstructured-IO/unstructured
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocx.py
More file actions
135 lines (116 loc) · 4.09 KB
/
docx.py
File metadata and controls
135 lines (116 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from typing import IO, List, Optional
import docx
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
# NOTE(robinson) - documentation on built in styles can be found at the link below
# ref: https://python-docx.readthedocs.io/en/latest/user/
#      styles-understanding.html#paragraph-styles-in-default-template
#
# Maps a paragraph style name to the element class instantiated for paragraphs with that
# style. Style names that are not keys here (e.g. "Normal") are classified from their raw
# text instead.
STYLE_TO_ELEMENT_MAPPING = {
    "Caption": Text,  # TODO(robinson) - add caption element type
    # "Heading 1" through "Heading 9"
    **{f"Heading {level}": Title for level in range(1, 10)},
    "Intense Quote": Text,  # TODO(robinson) - add quote element type
    # Each list family comes in three variants: "<family>", "<family> 2", "<family> 3"
    **{
        f"{family}{suffix}": ListItem
        for family in ("List", "List Bullet", "List Continue", "List Number")
        for suffix in ("", " 2", " 3")
    },
    "List Paragraph": ListItem,
    "Macro Text": Text,
    "No Spacing": Text,
    "Quote": Text,  # TODO(robinson) - add quote element type
    "Subtitle": Title,
    "TOCHeading": Title,
    "Title": Title,
}
def partition_docx(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    metadata_filename: Optional[str] = None,
) -> List[Element]:
    """Partitions a Microsoft Word document in .docx format into its document elements.

    Exactly one of ``filename`` or ``file`` must be supplied.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the
        document to .docx before partition. We want the original source filename in the
        metadata. Falls back to ``filename`` when not given.

    Returns
    -------
    The elements produced from the document's paragraphs, in document order. Paragraphs
    for which ``_paragraph_to_element`` yields ``None`` are left out.

    Raises
    ------
    ValueError
        If neither ``filename`` nor ``file`` is specified, or if both are.
    """
    has_filename = bool(filename)
    has_file = bool(file)

    if not (has_filename or has_file):
        raise ValueError("One of filename or file must be specified.")
    if has_filename and has_file:
        raise ValueError("Only one of filename or file can be specified.")

    # Exactly one source is set at this point; docx.Document is called with whichever it is.
    source = filename if has_filename else file
    document = docx.Document(source)

    metadata_filename = metadata_filename or filename

    elements: List[Element] = []
    for paragraph in document.paragraphs:
        converted = _paragraph_to_element(paragraph)
        if converted is None:
            continue
        converted.metadata = ElementMetadata(filename=metadata_filename)
        elements.append(converted)
    return elements
def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
    """Builds the unstructured document element that corresponds to a docx Paragraph.

    A paragraph whose text is empty or whitespace-only yields ``None``. Otherwise the
    paragraph's style name is looked up in ``STYLE_TO_ELEMENT_MAPPING``; when the style is
    mapped, that element class is instantiated with the paragraph text. When it is not
    mapped, the element type is predicted from the raw text via ``_text_to_element``.
    """
    raw_text = paragraph.text
    style_name = paragraph.style.name

    if not raw_text.strip():
        return None

    # NOTE - The "Normal" style name is not a key in the mapping, so the lookup gives None
    # for it, exactly as it does for any unknown style name.
    mapped_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)
    if mapped_class is not None:
        return mapped_class(raw_text)
    return _text_to_element(raw_text)
def _text_to_element(text: str) -> Optional[Text]:
    """Converts raw text into an unstructured Text element.

    The checks run in this order and the first match wins:

    1. Bulleted text becomes a ``ListItem`` holding the bullet-cleaned text, or ``None``
       if nothing but whitespace is left once the bullet is removed.
    2. Text recognized as a US city/state/zip line becomes an ``Address``.
    3. Any remaining text shorter than two characters is dropped (``None``).
    4. Possible narrative text becomes ``NarrativeText``.
    5. A possible title becomes ``Title``.
    6. Everything else becomes a generic ``Text``.
    """
    if is_bulleted_text(text):
        # Clean the bullet once and reuse the result for both the emptiness test and the
        # element text, rather than calling clean_bullets twice on the same input.
        cleaned = clean_bullets(text)
        return ListItem(text=cleaned) if cleaned.strip() else None

    if is_us_city_state_zip(text):
        return Address(text=text)

    # The length check deliberately comes after the bullet and address checks above.
    if len(text) < 2:
        return None

    if is_possible_narrative_text(text):
        return NarrativeText(text)
    if is_possible_title(text):
        return Title(text)
    return Text(text)