-
Notifications
You must be signed in to change notification settings - Fork 78
Expand file tree
/
Copy pathnumpy_processors.py
More file actions
123 lines (92 loc) · 4.37 KB
/
Copy pathnumpy_processors.py
File metadata and controls
123 lines (92 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from os import PathLike
from typing import Dict, List, Union, Sequence
import json
from PIL.Image import Image, BICUBIC
from tokenizers import Tokenizer
import numpy as np
from uform.shared import read_config
class TextProcessor:
    """Tokenize text into fixed-length NumPy ``input_ids`` / ``attention_mask`` arrays."""

    def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
        """
        :param config_path: path to the model config, loaded with ``read_config``
        :param tokenizer_path: path to tokenizer file
        """
        from os import fspath

        config = read_config(config_path)
        # A combined config keeps the text settings under the "text_encoder" key.
        if "text_encoder" in config:
            config = config["text_encoder"]

        self._max_seq_len = config["max_position_embeddings"]
        self._pad_token_idx = config["padding_idx"]

        # `Tokenizer.from_file` expects a plain string path, so normalize
        # `pathlib.Path` and other path-like objects before handing them over.
        self._tokenizer = Tokenizer.from_file(fspath(tokenizer_path))
        # Padding is done manually in `__call__`, up to the configured length.
        self._tokenizer.no_padding()

    def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]:
        """Transforms one or more strings into dictionary with tokenized strings and attention masks.

        :param texts: text or list of texts to tokenize
        :return: dictionary with ``input_ids`` and ``attention_mask``, both ``int32``
            arrays of shape ``(len(texts), max_position_embeddings)``
        """
        if isinstance(texts, str):
            texts = [texts]

        shape = (len(texts), self._max_seq_len)
        input_ids = np.full(shape, fill_value=self._pad_token_idx, dtype=np.int32)
        attention_mask = np.zeros(shape, dtype=np.int32)

        for i, seq in enumerate(self._tokenizer.encode_batch(texts)):
            # Sequences longer than the model limit are truncated on the right.
            ids = seq.ids[: self._max_seq_len]
            input_ids[i, : len(ids)] = ids
            attention_mask[i, : len(ids)] = 1

        return {"input_ids": input_ids, "attention_mask": attention_mask}
class ImageProcessor:
    """Resize, center-crop and normalize Pillow images into float32 NumPy arrays."""

    def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
        """
        :param config_path: path to the model config, loaded with ``read_config``
        :param tokenizer_path: accepted but not used by this class
        """
        config = read_config(config_path)
        # A combined config keeps the image settings under the "image_encoder" key.
        if "image_encoder" in config:
            config = config["image_encoder"]

        self._image_size = config["image_size"]
        self._normalization_means = config["normalization_means"]
        self._normalization_deviations = config["normalization_deviations"]

        assert isinstance(self._image_size, int) and self._image_size > 0
        assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
        assert len(self._normalization_means) == len(self._normalization_deviations) == 3

        # Shape (1, 1, 3) so the statistics broadcast over an (H, W, C) image.
        self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
        self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]

    def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray:
        """Transforms one or more Pillow images into a batched NumPy array.

        :param images: image or list of images to preprocess
        :return: ``float32`` array of shape ``(batch, 3, image_size, image_size)``;
            a single image yields a batch of one
        """
        if isinstance(images, Sequence):
            batch_images = np.empty(
                (len(images), 3, self._image_size, self._image_size),
                dtype=np.float32,
            )
            for i, image in enumerate(images):
                batch_images[i] = self._resize_crop_normalize(image)
        else:
            batch_images = self._resize_crop_normalize(images)[None]

        return batch_images

    def _resized_size(self, width, height):
        """Return the ``(width, height)`` that scales the shorter side to ``image_size``."""
        # Both results are computed from the ORIGINAL dimensions. The scaled
        # side must be derived before the shorter side is replaced, otherwise
        # the aspect ratio is lost for portrait images.
        if width < height:
            return self._image_size, int(height / width * self._image_size)
        return int(width / height * self._image_size), self._image_size

    def _center_crop_box(self, width, height):
        """Return an integer ``(left, top, right, bottom)`` box of exactly ``image_size`` pixels per side."""
        # Round the origin once and add the size, rather than rounding all four
        # fractional edges independently, so the crop is always
        # `image_size` x `image_size` (also for odd sizes).
        left = round((width - self._image_size) / 2)
        top = round((height - self._image_size) / 2)
        return left, top, left + self._image_size, top + self._image_size

    def _resize_crop_normalize(self, image: Image):
        """Preprocess one image and return it as a ``(3, image_size, image_size)`` float32 array.

        :param image: Pillow image to preprocess
        """
        width, height = self._resized_size(*image.size)
        image = image.resize((width, height), resample=BICUBIC)
        image = image.convert("RGB").crop(self._center_crop_box(width, height))

        # At this point `image` is a PIL Image with RGB channels.
        # If you convert it to `np.ndarray` it will have shape (H, W, C) where C is the number of channels.
        pixels = np.array(image).astype(np.float32) / 255.0
        normalized = (pixels - self.image_mean) / self.image_std

        # To make it compatible with PyTorch, we need to transpose the image to (C, H, W).
        return np.transpose(normalized, (2, 0, 1))