-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode.py
More file actions
executable file
·149 lines (116 loc) · 4.16 KB
/
code.py
File metadata and controls
executable file
·149 lines (116 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 15 19:42:13 2017
@author: wnma3
"""
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pywt
from pandas import DataFrame, Series
from scipy.interpolate import lagrange
from scipy.io import loadmat # mat是MATLAB的专用格式,调用loadmat方法读取
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
"""
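Overview of the code:
ployinterp_column --> fill a value by Lagrange interpolation (helper of programmer_1)
programmer_1 --> detect abnormal data (including NaN) and fill it in
programmer_2 --> min-max, zero-mean (z-score) and decimal-scaling normalization
programmer_3 --> discretization (equal-width, equal-frequency, K-Means) with cluster plots
programmer_4 --> basic DataFrame operations
programmer_5 --> feature analysis with wavelet decomposition
programmer_6 --> PCA to compute the component vectors, used for dimensionality-reduction analysis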
"""
# Base directory that the functions below prefix to their '/data/...' and
# '/tmp/...' file names: the current working directory of the process, not
# the location of this file.
path = os.getcwd()
def ployinterp_column(index, df, k=5):
    """Estimate the value at label ``index`` by Lagrange interpolation.

    Args:
        index: Integer label of the entry to estimate.
        df: Series to interpolate. Neighbours are selected by label
            arithmetic (``index - k`` .. ``index + k``), so the Series is
            assumed to carry consecutive integer labels -- TODO confirm for
            callers that do not use the default index.
        k: Number of neighbours taken on each side of ``index``.

    Returns:
        The interpolating polynomial evaluated at ``index``, or NaN when
        none of the neighbours holds a value.
    """
    neighbours = (list(range(index - k, index))
                  + list(range(index + 1, index + 1 + k)))
    # reindex() yields NaN for labels that are not in the Series instead of
    # raising KeyError, so entries near either end of the column still work.
    y = df.reindex(neighbours)
    y = y[y.notnull()]
    if y.empty:
        # lagrange() on zero points would silently evaluate to 0.
        return np.nan
    return lagrange(y.index, list(y))(index)


def programmer_1():
    """Replace outlier and missing sales values by Lagrange interpolation.

    Reads ``path + '/data/catering_sale.xls'``, treats sales below 400 or
    above 5000 as missing, fills every missing sales value from its
    neighbours and writes the table to ``path + '/tmp/sales.xls'``.
    """
    inputfile = path + '/data/catering_sale.xls'
    outputfile = path + '/tmp/sales.xls'
    column = u'销量'
    data = pd.read_excel(inputfile)

    # Blank only the sales cell of an outlier row; the other columns of that
    # row are kept.
    outliers = (data[column] < 400) | (data[column] > 5000)
    data[column] = data[column].mask(outliers)

    missing = data.index[data[column].isnull()]
    for index in missing:
        # .loc writes into ``data`` itself; chained indexing such as
        # data[[column]][index] = ... only modifies a temporary copy.
        data.loc[index, column] = ployinterp_column(index, data[column])

    data.to_excel(outputfile)
def programmer_2():
    """Print three column-wise normalizations of normalization_data.xls.

    The sheet is read without a header row. Printed, in order: min-max
    scaling, zero-mean scaling (divided by the standard deviation) and
    decimal scaling by a power of ten derived from each column's largest
    absolute value.
    """
    table = pd.read_excel(path + '/data/normalization_data.xls', header=None)

    # Min-max normalization.
    lowest = table.min()
    value_range = table.max() - lowest
    print((table - lowest) / value_range)

    # Zero-mean normalization.
    centered = table - table.mean()
    print(centered / table.std())

    # Decimal-scaling normalization.
    exponent = np.ceil(np.log10(table.abs().max()))
    print(table / 10**exponent)
# Discretization and cluster plots
def programmer_3():
    """Discretize one column into ``k = 4`` classes three ways and plot each.

    Reads ``path + '/data/discretization_data.xls'`` and bins the column
    u'肝气郁结证型系数' by
      1. equal-width intervals,
      2. equal-frequency intervals (quantile edges),
      3. intervals whose inner edges are the midpoints between sorted
         K-Means cluster centres.
    One scatter figure is shown per method.
    """
    datafile = path + '/data/discretization_data.xls'
    data = pd.read_excel(datafile)
    data = data[u'肝气郁结证型系数'].copy()
    k = 4

    # Method 1: equal-width bins.
    d1 = pd.cut(data, k, labels=range(k))

    # Method 2: equal-frequency bins. quantile() returns exactly the k + 1
    # edges asked for, so nothing depends on the row layout of describe().
    fractions = [1.0 * i / k for i in range(k + 1)]
    edges = list(data.quantile(fractions))
    # include_lowest keeps the minimum in the first bin whatever its sign
    # (shrinking the edge by a factor fails for a minimum of 0 or below).
    d2 = pd.cut(data, edges, labels=range(k), include_lowest=True)

    # Method 3: K-Means. ``n_jobs`` is not passed: scikit-learn removed that
    # argument from KMeans, and passing it raises TypeError there.
    kmodel = KMeans(n_clusters=k)
    kmodel.fit(data.values.reshape(len(data), 1))
    # Sort the cluster centres, then use the midpoint of each adjacent pair
    # as an inner boundary.
    centers = DataFrame(kmodel.cluster_centers_).sort_values(0)
    midpoints = centers.rolling(2).mean().iloc[1:]
    # Outer boundaries are the data's own extremes so that no value,
    # including one at or below 0, falls outside the bins.
    edges = [data.min()] + list(midpoints[0]) + [data.max()]
    d3 = pd.cut(data, edges, labels=range(k), include_lowest=True)

    def cluster_plot(d, k):
        """Plot the values of ``data`` against their class label in ``d``."""
        plt.figure(figsize=(8, 3))
        for j in range(k):
            members = data[d == j]
            plt.plot(members, [j] * len(members), 'o')
        plt.ylim(-0.5, k - 0.5)
        return plt

    cluster_plot(d1, k).show()
    cluster_plot(d2, k).show()
    cluster_plot(d3, k).show()
def programmer_4():
    """Add a derived ratio column to the electricity sheet and save it.

    Reads ``path + "/data/electricity_data.xls"``, stores
    (u"供入电量" - u"供出电量") / u"供入电量" in a new column u"线损率" and
    writes the table, without the index, to
    ``path + "/tmp/electricity_data.xls"``.
    """
    frame = pd.read_excel(path + "/data/electricity_data.xls")
    supplied = frame[u"供入电量"]
    delivered = frame[u"供出电量"]
    frame[u"线损率"] = (supplied - delivered) / supplied
    frame.to_excel(path + "/tmp/electricity_data.xls", index=False)
def programmer_5():
    """Print the level-5 "bior3.7" wavelet decomposition of the signal.

    The signal is the first row of the "leleccum" variable stored in
    ``path + "/data/leleccum.mat"``.
    """
    contents = loadmat(path + "/data/leleccum.mat")
    series = contents["leleccum"][0]
    # The result holds level + 1 arrays: the approximation coefficients
    # first, followed by the detail coefficient arrays.
    decomposition = pywt.wavedec(series, "bior3.7", level=5)
    print(decomposition)
def programmer_6(n_components=None):
    """Fit a PCA on principal_component.xls and save the projected data.

    Reads ``path + "/data/principal_component.xls"`` (no header row), prints
    the component vectors and the fraction of variance explained by each,
    and writes the data projected onto the components to
    ``path + "/tmp/dimention_reducted.xls"``.

    Args:
        n_components: Forwarded to ``PCA``. The default ``None`` matches the
            argument-less ``PCA()`` call; pass a smaller number to actually
            reduce the number of columns.
    """
    inputfile = path + "/data/principal_component.xls"
    # Prefixed with ``path`` like every other output file in this module;
    # a bare "/tmp/..." would write to the filesystem root's tmp directory.
    outputfile = path + "/tmp/dimention_reducted.xls"
    data = pd.read_excel(inputfile, header=None)

    pca = PCA(n_components=n_components)
    pca.fit(data)
    # Report the fitted model instead of evaluating and discarding it.
    print(pca.components_)
    print(pca.explained_variance_ratio_)

    # Save the projection, not the untouched input.
    reduced = DataFrame(pca.transform(data))
    reduced.to_excel(outputfile, index=False)
if __name__ == '__main__':
    # Every example call is commented out, so running the file as a script
    # does nothing; uncomment the call(s) you want to run.
    # programmer_1()
    # programmer_2()
    # programmer_3()
    # programmer_4()
    # programmer_5()
    # programmer_6()
    pass