SVM In MNIST

SVM In MNIST

算法原理简述:

算法中的主要证明在此处不做赘述;主要是介绍代码和与之对应的算法原理。首先,我们是从所有的alpha中调整了其顺序,先遍历那些符合0<alpha<c的数据;然后根据偏离kkt条件的程度,选择出alpha1和alpha2;然后我们根据选择出的alpha1和alpha2计算出alpha1的上下界;计算在未约束的情况下alpha1的最优值;根据和上下界的关系确定新的alpha1值;根据alpha1值确定alpha2值;接下来我们根据新的alpha1和alpha2更新对应的b值和对应的E函数值;至此一轮的迭代结束。
算法中的主要思路是我们有原始问题;但是原始问题相对难以求解,所以选择对偶问题来突破。得到对偶问题的解我们就可以逆向求解得到原始问题的解。对偶问题涉及到多个变量的二次规划问题;所以我们选择每次选择两个变量来不断逼近最优解。选择变量的方式就是找到最不符合KKT条件的第一个变量;然后根据第一个变量找到|E1 - E2|最大的那个第二个变量即可。

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : SVM.py
# @Author: mfzhu
# @Date : 2017/5/10
# @Desc :
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
def binary(data, threshold):
"""
:param data: 需要二值化的数据
:param threshold: 二值化的阈值
:return: 二值化后的数据
"""
index_1 = np.array(data > threshold)
index_2 = np.array(data <= threshold)
data[index_1] = 1
data[index_2] = 0
return data
class svm(object):
def __init__(self, data_in, label_in, penalty_coefficient, epsilon):
self.data = data_in
# 初始化训练数据
self.label = label_in
# 初始化标签数据
self.epsilon = epsilon
# 确定阈值
self.penalty_coefficient = penalty_coefficient
# 确定惩罚系数
self.row = data_in.shape[0]
# 数据行数
self.column = data_in.shape[1]
# 数据维数
self.alpha = np.zeros([1, self.row], float)[0]
# 初始化待训练系数
self.b = 0.0
# 初始化偏移量
self.E = [self.e_func(i) for i in range(self.row)]
# 计算每个样本的E函数值
self.iteration = 5000
# 确定迭代次数
def g_func(self, index):
# 计算第index个数据的G函数值
res = self.b
for i in range(self.row):
res += self.alpha[i] * self.label[i] * np.dot(self.data[index], self.data[i])
return res
def e_func(self, index):
# 计算第index个数据的E函数值
return self.g_func(index) - self.label[index]
def satisfy_KKT(self, index):
# 判断第index个样本是否符合KKT条件
judgeNum = self.g_func(index) * self.label[index]
if abs(self.alpha[index]) < self.epsilon:
return judgeNum > 1 or judgeNum == 1
elif abs(self.alpha[index] - self.penalty_coefficient) < self.epsilon:
return judgeNum < 1 or judgeNum == 1
else:
return abs(judgeNum - 1) < self.epsilon
# def is_stop(self):
# # 确定是否当前符合所有样本符合KKT条件
# for index in range(self.row):
# if not self.satisfy_KKT(index):
# return False
# return True
# def kernel(self):
#
# if self.kernel == "liner":
# pass
# if self.kernel == "poly":
# return np.sum(np.dot(x1, x2))**3
def select_alpha1_alpha2(self):
# 选择出两个待迭代的系数
index_list = [i for i in range(self.row)]
front_list = list(filter(lambda i: 0 < self.alpha[i] < self.penalty_coefficient, index_list))
back_list = list(set(index_list) - set(front_list))
index = front_list
index.extend(back_list)
# 此处主要是重新调整了索引的顺序,先遍历的部分是满足(0, C)的索引
for i in index:
if self.satisfy_KKT(i):
continue
E_1 = self.E[i]
max_difference = [0, 0]
# 保存的是另外一个索引和其E函数值和第一个索引的E函数值的绝对值差
for j in index_list:
if j == i:
continue
E_2 = self.E[j]
if abs(E_1 - E_2) > max_difference[0]:
max_difference[0] = abs(E_1 - E_2)
max_difference[1] = j
# 找到E函数值差最大的两个索引并返回
return i, max_difference[1]
def K_fun(self, index1, index2):
# 计算两个样本之间的k函数值
return np.dot(self.data[index1], self.data[index2])
def train(self):
# 训练函数
for time in range(self.iteration):
index1, index2 = self.select_alpha1_alpha2()
# 选择出两个待训练的索引
if self.label[index1] == self.label[index2]:
# 如果两个索引对应的标签同号的情况
# 确定alpha2的上下界
lowBound = max(0, self.alpha[index1] + self.alpha[index2] - self.penalty_coefficient)
highBound = min(self.penalty_coefficient, self.alpha[index1] + self.alpha[index2])
else:
# 异号的情况
# 确定alpha2的上下界
lowBound = max(0, self.alpha[index2] - self.alpha[index1])
highBound = min(self.penalty_coefficient, self.penalty_coefficient +
self.alpha[index2] - self.alpha[index1])
eta = self.K_fun(index1, index1) + self.K_fun(index2, index2) - 2 * self.K_fun(index1, index2)
# 计算eta值
alpha1_old = self.alpha[index1]
alpha2_old = self.alpha[index2]
# 记录下alpha1和alpha2的原始值
alpha2_unc = self.alpha[index2] + self.label[index2] * (self.E[index1] - self.E[index2]) / eta
# 计算未约束的情况下的alpha2的结果
if alpha2_unc > highBound:
alpha2_new = highBound
elif alpha2_unc < lowBound:
alpha2_new = lowBound
else:
alpha2_new = alpha2_unc
# 判断未约束值与上下界的关系并更新alpha2的值
alpha1_new = alpha1_old + (self.label[index1] * self.label[index2]
* (alpha2_old - alpha2_new))
# 更新alpha1的值
b_new = 0
b1_new = -self.E[index1] - self.label[index1] * self.K_fun(index1, index1) * (
alpha1_new - alpha1_old) - self.label[index2] * self.K_fun(
index2, index1) * (alpha2_new - alpha2_old) + self.b
b2_new = -self.E[index2] - self.label[index2] * self.K_fun(index2, index2) * (
alpha2_new - alpha2_old) - self.label[index1] * self.K_fun(
index1, index2) * (alpha1_new - alpha1_old) + self.b
# 更新alpha1和alpha2对应的b值
if 0 < alpha1_new < self.penalty_coefficient:
b_new = b1_new
elif 0 < alpha2_new < self.penalty_coefficient:
b_new = b2_new
else:
b_new = (b1_new + b2_new) / 2
self.b = b_new
# 更新b值
self.alpha[index1] = alpha1_new
self.alpha[index2] = alpha2_new
# 更新alpha值
self.E[index1] = self.e_func(index1)
self.E[index2] = self.e_func(index2)
# 更新E值
def _predict_(self, feature):
# 预测输入向量的标签
result = self.b
for i in range(self.row):
result += self.alpha[i] * self.label[i] * np.dot(self.data[i], feature)
if result > 0:
return 1
else:
return -1
def predict(self, features):
# 预测多个输入的标签
result = []
for i in range(len(features)):
result.append(self._predict_(features[i]))
return result
if __name__ == '__main__':
time1 = time.time()
print("start reading the data:")
raw_path = r'F:\work and learn\ML\dataset\MNIST\train.csv'
raw_data = np.loadtxt(raw_path, delimiter=',', skiprows=1)
# 读入原始数据
label = raw_data[:, 0]
data = raw_data[:, 1:]
# 区分标签数据和原始数据
data = binary(data, 100)
label[label != 1] = -1
# 进行二值化和对标签二值化(这边做的是二分类)
train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.33, random_state=23323)
time2 = time.time()
print("read data cost:", time2 - time1, " seconds", '\n')
# 将训练数据切分成三块用于训练和测试
print("start training the model:")
clf = svm(train_data, train_label, 1000, 0.001)
clf.train()
time3 = time.time()
print("training cost:", time3 - time2, ' seconds', '\n')
print("start predicting:")
# 定义svm类然后训练
test_predict = clf.predict(test_data)
print(test_predict)
score = accuracy_score(test_label, test_predict)
time4 = time.time()
print("predicting cost:", time4 - time3, ' seconds', '\n')
# 利用训练到的模型预测测试集的标签
print("the accuracy_score is:", score)

代码运行结果:

从运行结果可以知道,训练的时间较长,且预测的正确率较低。通过与sklearn的包的内置函数比较,正确率和训练时间等方面都较差于sklearn的svm。
img