SVM In MNIST

算法原理简述：

算法中的主要证明在此处不做赘述；主要是介绍代码和与之对应的算法原理。首先，我们是从所有的alpha中调整了其顺序，先遍历那些符合0<alpha<c的数据；然后根据偏离kkt条件的程度，选择出alpha1和alpha2;然后我们根据选择出的alpha1和alpha2计算出alpha1的上下界；计算在未约束的情况下alpha1的最优值；根据和上下界的关系确定新的alpha1值；根据alpha1值确定alpha2值；接下来我们根据新的alpha1和alpha2更新对应的b值和对应的E函数值；至此一轮的迭代结束。
算法中的主要思路是我们有原始问题；但是原始问题相对难以求解，所以选择对偶问题来突破。得到对偶问题的解我们就可以逆向求解得到原始问题的解。对偶问题涉及到多个变量的二次规划问题；所以我们选择每次选择两个变量来不断逼近最优解。选择变量的方式就是找到最不符合KKT条件的第一个变量；然后根据第一个变量找到|E1 - E2|最大的那个第二个变量即可。

代码如下：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : SVM.py
# @Author: mfzhu
# @Date  : 2017/5/10
# @Desc  :
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
def binary(data, threshold):
    """
    :param data: 需要二值化的数据
    :param threshold: 二值化的阈值
    :return: 二值化后的数据
    """
    index_1 = np.array(data > threshold)
    index_2 = np.array(data <= threshold)
    data[index_1] = 1
    data[index_2] = 0
    return data
class svm(object):
    def __init__(self, data_in, label_in, penalty_coefficient, epsilon):
        self.data = data_in
        # 初始化训练数据
        self.label = label_in
        # 初始化标签数据
        self.epsilon = epsilon
        # 确定阈值
        self.penalty_coefficient = penalty_coefficient
        # 确定惩罚系数
        self.row = data_in.shape[0]
        # 数据行数
        self.column = data_in.shape[1]
        # 数据维数
        self.alpha = np.zeros([1, self.row], float)[0]
        # 初始化待训练系数
        self.b = 0.0
        # 初始化偏移量
        self.E = [self.e_func(i) for i in range(self.row)]
        # 计算每个样本的E函数值
        self.iteration = 5000
        # 确定迭代次数
    def g_func(self, index):
        # 计算第index个数据的G函数值
        res = self.b
        for i in range(self.row):
            res += self.alpha[i] * self.label[i] * np.dot(self.data[index], self.data[i])
        return res
    def e_func(self, index):
        # 计算第index个数据的E函数值
        return self.g_func(index) - self.label[index]
    def satisfy_KKT(self, index):
        # 判断第index个样本是否符合KKT条件
        judgeNum = self.g_func(index) * self.label[index]
        if abs(self.alpha[index]) < self.epsilon:
            return judgeNum > 1 or judgeNum == 1
        elif abs(self.alpha[index] - self.penalty_coefficient) < self.epsilon:
            return judgeNum < 1 or judgeNum == 1
        else:
            return abs(judgeNum - 1) < self.epsilon
    # def is_stop(self):
    #     # 确定是否当前符合所有样本符合KKT条件
    #     for index in range(self.row):
    #         if not self.satisfy_KKT(index):
    #             return False
    #     return True
    # def kernel(self):
    #
    #     if self.kernel == "liner":
    #         pass
    #     if self.kernel == "poly":
    #         return np.sum(np.dot(x1, x2))**3
    def select_alpha1_alpha2(self):
        # 选择出两个待迭代的系数
        index_list = [i for i in range(self.row)]
        front_list = list(filter(lambda i: 0 < self.alpha[i] < self.penalty_coefficient, index_list))
        back_list = list(set(index_list) - set(front_list))
        index = front_list
        index.extend(back_list)
        # 此处主要是重新调整了索引的顺序，先遍历的部分是满足(0, C)的索引
        for i in index:
            if self.satisfy_KKT(i):
                continue
            E_1 = self.E[i]
            max_difference = [0, 0]
            # 保存的是另外一个索引和其E函数值和第一个索引的E函数值的绝对值差
            for j in index_list:
                if j == i:
                    continue
                E_2 = self.E[j]
                if abs(E_1 - E_2) > max_difference[0]:
                    max_difference[0] = abs(E_1 - E_2)
                    max_difference[1] = j
            # 找到E函数值差最大的两个索引并返回
            return i, max_difference[1]
    def K_fun(self, index1, index2):
        # 计算两个样本之间的k函数值
        return np.dot(self.data[index1], self.data[index2])
    def train(self):
        # 训练函数
        for time in range(self.iteration):
            index1, index2 = self.select_alpha1_alpha2()
            # 选择出两个待训练的索引
            if self.label[index1] == self.label[index2]:
                # 如果两个索引对应的标签同号的情况
                # 确定alpha2的上下界
                lowBound = max(0, self.alpha[index1] + self.alpha[index2] - self.penalty_coefficient)
                highBound = min(self.penalty_coefficient, self.alpha[index1] + self.alpha[index2])
            else:
                # 异号的情况
                # 确定alpha2的上下界
                lowBound = max(0, self.alpha[index2] - self.alpha[index1])
                highBound = min(self.penalty_coefficient, self.penalty_coefficient +
                                self.alpha[index2] - self.alpha[index1])
            eta = self.K_fun(index1, index1) + self.K_fun(index2, index2) - 2 * self.K_fun(index1, index2)
            # 计算eta值
            alpha1_old = self.alpha[index1]
            alpha2_old = self.alpha[index2]
            # 记录下alpha1和alpha2的原始值
            alpha2_unc = self.alpha[index2] + self.label[index2] * (self.E[index1] - self.E[index2]) / eta
            # 计算未约束的情况下的alpha2的结果
            if alpha2_unc > highBound:
                alpha2_new = highBound
            elif alpha2_unc < lowBound:
                alpha2_new = lowBound
            else:
                alpha2_new = alpha2_unc
            # 判断未约束值与上下界的关系并更新alpha2的值
            alpha1_new = alpha1_old + (self.label[index1] * self.label[index2]
                                       * (alpha2_old - alpha2_new))
            # 更新alpha1的值
            b_new = 0
            b1_new = -self.E[index1] - self.label[index1] * self.K_fun(index1, index1) * (
                alpha1_new - alpha1_old) - self.label[index2] * self.K_fun(
                index2, index1) * (alpha2_new - alpha2_old) + self.b
            b2_new = -self.E[index2] - self.label[index2] * self.K_fun(index2, index2) * (
                alpha2_new - alpha2_old) - self.label[index1] * self.K_fun(
                index1, index2) * (alpha1_new - alpha1_old) + self.b
            # 更新alpha1和alpha2对应的b值
            if 0 < alpha1_new < self.penalty_coefficient:
                b_new = b1_new
            elif 0 < alpha2_new < self.penalty_coefficient:
                b_new = b2_new
            else:
                b_new = (b1_new + b2_new) / 2
            self.b = b_new
            # 更新b值
            self.alpha[index1] = alpha1_new
            self.alpha[index2] = alpha2_new
            # 更新alpha值
            self.E[index1] = self.e_func(index1)
            self.E[index2] = self.e_func(index2)
            # 更新E值
    def _predict_(self, feature):
        # 预测输入向量的标签
        result = self.b
        for i in range(self.row):
            result += self.alpha[i] * self.label[i] * np.dot(self.data[i], feature)
        if result > 0:
            return 1
        else:
            return -1
    def predict(self, features):
        # 预测多个输入的标签
        result = []
        for i in range(len(features)):
            result.append(self._predict_(features[i]))
        return result
if __name__ == '__main__':
    time1 = time.time()
    print("start reading the data:")
    raw_path = r'F:\work and learn\ML\dataset\MNIST\train.csv'
    raw_data = np.loadtxt(raw_path, delimiter=',', skiprows=1)
    # 读入原始数据
    label = raw_data[:, 0]
    data = raw_data[:, 1:]
    # 区分标签数据和原始数据
    data = binary(data, 100)
    label[label != 1] = -1
    # 进行二值化和对标签二值化（这边做的是二分类）
    train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.33, random_state=23323)
    time2 = time.time()
    print("read data cost:", time2 - time1, " seconds", '\n')
    # 将训练数据切分成三块用于训练和测试
    print("start training the model:")
    clf = svm(train_data, train_label, 1000, 0.001)
    clf.train()
    time3 = time.time()
    print("training cost:", time3 - time2, ' seconds', '\n')
    print("start predicting:")
    # 定义svm类然后训练
    test_predict = clf.predict(test_data)
    print(test_predict)
    score = accuracy_score(test_label, test_predict)
    time4 = time.time()
    print("predicting cost:", time4 - time3, ' seconds', '\n')
    # 利用训练到的模型预测测试集的标签
    print("the accuracy_score is:", score)

代码运行结果：

从运行结果可以知道，训练的时间较长，且预测的正确率较低。通过与sklearn的包的内置函数比较，正确率和训练时间等方面都较差于sklearn的svm。