-
Notifications
You must be signed in to change notification settings - Fork 179
Expand file tree
/
Copy pathcentroid.py
More file actions
102 lines (87 loc) · 3.6 KB
/
centroid.py
File metadata and controls
102 lines (87 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
#
# Copyright 2014-2025 BigML
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Centroid structure for the BigML local Cluster
This module defines an auxiliary Centroid predicate structure that is used
in the cluster.
"""
import math
import sys
# Indentation unit (four spaces) used when writing centroid statistics.
INDENT = 4 * " "

# Display titles of the distance statistics, in the order they are printed.
# Centroid.print_statistics derives the lookup key for each title by
# lower-casing it and replacing spaces with underscores.
STATISTIC_MEASURES = [
    'Minimum',
    'Mean',
    'Median',
    'Maximum',
    'Standard deviation',
    'Sum',
    'Sum squares',
    'Variance',
]
def cosine_distance2(terms, centroid_terms, scale):
    """Return the squared, scaled distance derived from cosine similarity.

    The similarity is the number of centroid terms that also appear in
    ``terms`` divided by ``sqrt(len(terms) * len(centroid_terms))``. The
    distance is ``scale * (1 - similarity)`` and the function returns its
    square.

    Empty collections are handled before any division takes place:
    two empty collections are at distance 0, and an empty collection
    against a non-empty one is at distance 1 before scaling, so the
    function returns ``scale ** 2``.
    """
    if not terms or not centroid_terms:
        # At least one side is empty: only when both are empty is the
        # distance zero.
        return scale ** 2 if terms or centroid_terms else 0

    shared = sum(1 for term in centroid_terms if term in terms)
    norm = math.sqrt(len(terms) * len(centroid_terms))
    scaled_distance = scale * (1 - shared / norm)
    return scaled_distance ** 2
class Centroid():
    """A Centroid.

    Thin wrapper around the dictionary that describes one centroid. It
    stores the centroid's center, count, id, name and distance information
    and offers the squared-distance computation plus a statistics printer.
    """

    def __init__(self, centroid_info):
        """Read the centroid attributes from the `centroid_info` dict.

        Missing keys fall back to defaults: an empty dict for ``center``
        and ``distance``, 0 for ``count`` and None for the id and name.
        The id is read from ``id`` and, failing that, from ``centroid_id``.
        """
        # Maps field ids to the centroid's value for that field.
        self.center = centroid_info.get('center', {})
        self.count = centroid_info.get('count', 0)
        self.centroid_id = centroid_info.get(
            'id', centroid_info.get("centroid_id", None))
        self.name = centroid_info.get('name', None)
        # Looked up by measure key in print_statistics.
        self.distance = centroid_info.get('distance', {})

    def distance2(self, input_data, term_sets, scales, stop_distance2=None):
        """Squared Distance from the given input data to the centroid

        Each field in the centroid's center contributes according to the
        type of its center value:

        - list (text field): ``cosine_distance2`` between the field's
          entry in `term_sets` (an empty list when absent) and the value.
        - str: ``scales[field_id] ** 2`` when the input lacks the field or
          holds a different value; nothing otherwise.
        - anything else: ``((input - value) * scale) ** 2``.

        When `stop_distance2` is given and the running total reaches it,
        the computation stops and None is returned.

        Raises ValueError (chained to the original error) when the
        contribution of a field cannot be computed, for instance when a
        numeric field is missing from `input_data`.
        """
        distance2 = 0.0
        for field_id, value in self.center.items():
            try:
                if isinstance(value, list):
                    # text field
                    terms = (term_sets[field_id] if field_id in term_sets
                             else [])
                    distance2 += cosine_distance2(terms, value,
                                                  scales[field_id])
                elif isinstance(value, str):
                    if (field_id not in input_data or
                            input_data[field_id] != value):
                        distance2 += scales[field_id] ** 2
                else:
                    distance2 += ((input_data[field_id] - value) *
                                  scales[field_id]) ** 2
                if stop_distance2 is not None and distance2 >= stop_distance2:
                    return None
            except Exception as exc:
                # The failure is often that the field is missing from the
                # input, so the lookup for the message must not raise and
                # mask the ValueError callers are meant to see.
                try:
                    input_value = input_data[field_id]
                except (KeyError, IndexError, TypeError):
                    input_value = None
                raise ValueError(
                    "Error computing field id %s input %s value %s" %
                    (field_id, input_value, value)) from exc
        return distance2

    def print_statistics(self, out=sys.stdout):
        """Print the statistics for the training data clustered around the
        centroid

        Writes the centroid name followed by one line per entry in
        STATISTIC_MEASURES to `out`. Each value is read from
        ``self.distance`` under the title lower-cased with spaces replaced
        by underscores. A trailing blank line is written at the end.
        """
        out.write("%s%s:\n" % (INDENT, self.name))
        literal = "%s%s: %s\n"
        for measure_title in STATISTIC_MEASURES:
            measure = measure_title.lower().replace(" ", "_")
            out.write(literal % (INDENT * 2, measure_title,
                                 self.distance[measure]))
        out.write("\n")