forked from Unstructured-IO/unstructured
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext.py
More file actions
79 lines (65 loc) · 2.29 KB
/
text.py
File metadata and controls
79 lines (65 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
from typing import IO, List, Optional
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
def split_by_paragraph(content: str) -> List[str]:
    """Splits text into pieces at every match of PARAGRAPH_PATTERN.

    Parameters
    ----------
    content
        The raw text to split.

    Returns
    -------
    List[str]
        The pieces produced by the regex split, in their original order. Pieces are
        returned as-is; no stripping or filtering of empty strings happens here.
    """
    paragraph_boundary = re.compile(PARAGRAPH_PATTERN)
    pieces = paragraph_boundary.split(content)
    return pieces
def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.

    The content is read from exactly one of ``filename``, ``file``, or ``text``, split
    into paragraphs with ``split_by_paragraph``, and every non-blank paragraph is
    turned into a single element.

    Parameters
    ----------
    filename
        A string defining the target filename path. The file is read as UTF-8.
    file
        A file-like object using "r" mode --> open(filename, "r"). If ``read()``
        returns bytes (binary mode), they are decoded as UTF-8.
    text
        The string representation of the .txt document. An empty string is a valid,
        empty document and produces no elements.

    Returns
    -------
    List[Element]
        One element per non-blank paragraph, in document order. All elements share a
        single ElementMetadata built from ``filename``.

    Raises
    ------
    ValueError
        If none of ``filename``, ``file``, or ``text`` is supplied, or if more than
        one of them is supplied. Falsy values (None, "") count as "not supplied",
        except that an empty ``text`` on its own is treated as an empty document.
    """
    supplied = [source for source in (filename, file, text) if source]
    if not supplied:
        if text is not None:
            # text="" was explicitly passed: that is an empty document, not a
            # missing argument, so there is simply nothing to partition.
            return []
        raise ValueError("One of filename, file, or text must be specified.")
    if len(supplied) > 1:
        raise ValueError("Only one of filename, file, or text can be specified.")

    if filename:
        with open(filename, encoding="utf8") as f:
            file_text = f.read()
    elif file:
        file_text = file.read()
        if isinstance(file_text, bytes):
            # Binary-mode file objects hand back bytes; decode with the same
            # encoding used for the filename branch so the regex split works.
            file_text = file_text.decode("utf8")
    else:
        file_text = str(text)

    if not file_text.strip():
        # Whitespace-only content cannot contain a non-blank paragraph.
        return []

    elements: List[Element] = []
    metadata = ElementMetadata(filename=filename)
    for paragraph in split_by_paragraph(file_text):
        paragraph = paragraph.strip()
        if paragraph == "":
            continue

        # The checks are ordered: the first one that matches decides the element type.
        if is_bulleted_text(paragraph):
            elements.append(ListItem(text=clean_bullets(paragraph), metadata=metadata))
        elif is_us_city_state_zip(paragraph):
            elements.append(Address(text=paragraph, metadata=metadata))
        elif is_possible_narrative_text(paragraph):
            elements.append(NarrativeText(text=paragraph, metadata=metadata))
        elif is_possible_title(paragraph):
            elements.append(Title(text=paragraph, metadata=metadata))
        else:
            elements.append(Text(text=paragraph, metadata=metadata))

    return elements