-
-
Notifications
You must be signed in to change notification settings - Fork 146
Expand file tree
/
Copy pathsuggestion_list.py
More file actions
114 lines (83 loc) · 3.54 KB
/
suggestion_list.py
File metadata and controls
114 lines (83 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""List with suggestions"""
from __future__ import annotations
from typing import TYPE_CHECKING
from .natural_compare import natural_comparison_key
if TYPE_CHECKING:
from collections.abc import Collection
__all__ = ["suggestion_list"]
def suggestion_list(input_: str, options: Collection[str]) -> list[str]:
"""Get list with suggestions for a given input.
Given an invalid input string and list of valid options, returns a filtered list
of valid options sorted based on their similarity with the input.
"""
options_by_distance = {}
lexical_distance = LexicalDistance(input_)
threshold = int(len(input_) * 0.4) + 1
for option in options:
distance = lexical_distance.measure(option, threshold)
if distance is not None:
options_by_distance[option] = distance
return sorted(
options_by_distance,
key=lambda option: (
options_by_distance.get(option, 0),
natural_comparison_key(option),
),
)
class LexicalDistance:
"""Computes the lexical distance between strings A and B.
The "distance" between two strings is given by counting the minimum number of edits
needed to transform string A into string B. An edit can be an insertion, deletion,
or substitution of a single character, or a swap of two adjacent characters.
This distance can be useful for detecting typos in input or sorting.
"""
_input: str
_input_lower_case: str
_input_list: list[int]
_rows: list[list[int]]
def __init__(self, input_: str) -> None:
self._input = input_
self._input_lower_case = input_.lower()
row_size = len(input_) + 1
self._input_list = list(map(ord, self._input_lower_case))
self._rows = [[0] * row_size, [0] * row_size, [0] * row_size]
def measure(self, option: str, threshold: int) -> int | None:
if self._input == option:
return 0
option_lower_case = option.lower()
# Any case change counts as a single edit
if self._input_lower_case == option_lower_case:
return 1
a, b = list(map(ord, option_lower_case)), self._input_list
a_len, b_len = len(a), len(b)
if a_len < b_len:
a, b = b, a
a_len, b_len = b_len, a_len
if a_len - b_len > threshold:
return None
rows = self._rows
for j in range(b_len + 1):
rows[0][j] = j
for i in range(1, a_len + 1):
up_row = rows[(i - 1) % 3]
current_row = rows[i % 3]
smallest_cell = current_row[0] = i
for j in range(1, b_len + 1):
cost = 0 if a[i - 1] == b[j - 1] else 1
current_cell = min(
up_row[j] + 1, # delete
current_row[j - 1] + 1, # insert
up_row[j - 1] + cost, # substitute
)
if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]:
# transposition
double_diagonal_cell = rows[(i - 2) % 3][j - 2]
current_cell = min(current_cell, double_diagonal_cell + 1)
smallest_cell = min(current_cell, smallest_cell)
current_row[j] = current_cell
# Early exit, since distance can't go smaller than smallest element
# of the previous row.
if smallest_cell > threshold:
return None
distance = rows[a_len % 3][b_len]
return distance if distance <= threshold else None