Adaboost In MNIST

adaboost模型简介

boosting（提升）方法是一种常用的统计学习方法。在分类问题中，它通过改变训练样本的权重来学习多个分类器，并将这些分类器进行线性组合，从而提高分类的性能。本文所介绍的 adaboost 就是 boosting 方法中最具代表性的一种。
其基本思想是：相对于强分类器，弱分类器更容易得到；因此我们先学习一系列弱分类器，再通过对这些弱分类器进行线性组合来构成一个强分类器。这其实就是“三个臭皮匠顶个诸葛亮”的思想。

其中adaboost方法中的特点有：

我们通过上一轮学习到的弱分类器来改变下一轮训练数据的权重。当前学习到的分类器的分类误差率越小，其分类器权重（系数 alpha = 0.5 * ln((1 - e) / e)）越高；误差率越大，其权重越低。
在通过当前分类器更改数据集的权重的时候，误分类的数据样本权重增大，正确分类的样本权重降低。

代码如下：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : adaBoost.py
# @Author: mfzhu
# @Date  : 2017/4/29
# @Desc  :
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def binary(data, threshold):
    """Binarize ``data`` in place around ``threshold`` and return it.

    Entries strictly greater than ``threshold`` become 1 and entries less
    than or equal to it become 0. Entries for which neither comparison
    holds are left untouched.

    :param data: array supporting element-wise comparison and boolean-mask
        assignment; it is modified in place
    :param threshold: cut-off value
    :return: the same ``data`` object that was passed in
    """
    # Both masks are computed before any write so that the second
    # comparison still sees the original values.
    above = np.asarray(data > threshold)
    at_or_below = np.asarray(data <= threshold)
    for mask, value in ((above, 1), (at_or_below, 0)):
        data[mask] = value
    return data
class sign(object):
    """Threshold classifier (decision stump) on a single feature column.

    For every candidate threshold ``t`` two rules are scored:

    * "less than": predict +1 when ``x < t``, otherwise -1;
    * "more than": predict -1 when ``x < t``, otherwise +1.

    ``train`` keeps whichever rule has the smaller weighted error. A sample
    counts as misclassified when ``prediction * label`` is negative, so
    labels are expected to be +1 / -1.
    """

    def __init__(self, sample, label, weight):
        """
        :param sample: one feature column of the data set (one value per sample)
        :param label:  labels aligned with ``sample``
        :param weight: per-sample weights used to score candidate thresholds
        """
        self.sample = sample
        self.label = label
        self.weight = weight
        # Threshold chosen by train(); -1 means "not trained yet".
        self.segment_value = -1
        # Candidate thresholds tried by the "less than" rule.
        self.less_index = [0, 1, 2]
        # Candidate thresholds tried by the "more than" rule.
        self.more_index = [0, 1, 2]
        # Flags recording which rule train() selected.
        self.is_less = False
        self.is_more = False

    def _weighted_error(self, seg_value, below, above):
        """Return the total weight of the samples one rule misclassifies.

        :param seg_value: threshold being evaluated
        :param below: prediction for samples with ``value < seg_value``
        :param above: prediction for all other samples
        """
        error = 0.0
        for index, value in enumerate(self.sample):
            guess = below if value < seg_value else above
            if guess * self.label[index] < 0:
                error += self.weight[index]
        return error

    def _best_split(self, candidates, below, above):
        """Return ``(threshold, error)`` of the best candidate for one rule.

        Ties keep the earliest candidate because only a strictly smaller
        error replaces the current best.
        """
        best_value = -1
        best_error = 100000
        for seg_value in candidates:
            error = self._weighted_error(seg_value, below, above)
            if error < best_error:
                best_value, best_error = seg_value, error
        return best_value, best_error

    def train_less_than(self):
        """Fit the "less than" rule.

        :return: tuple ``(best threshold, weighted error at that threshold)``
        """
        return self._best_split(self.less_index, 1, -1)

    def train_more_than(self):
        """Fit the "more than" rule.

        :return: tuple ``(best threshold, weighted error at that threshold)``
        """
        return self._best_split(self.more_index, -1, 1)

    def train(self):
        """Fit both rules, keep the better one and return its weighted error.

        The "less than" rule is chosen only when its error is strictly
        smaller; otherwise the "more than" rule is used.
        """
        less_seg, less_score = self.train_less_than()
        more_seg, more_score = self.train_more_than()
        use_less = bool(less_score < more_score)
        # Assign both flags on every call: predict() checks is_less first,
        # so a stale True left over from an earlier train() would make it
        # apply the wrong rule after retraining.
        self.is_less = use_less
        self.is_more = not use_less
        if use_less:
            self.segment_value = less_seg
            return less_score
        self.segment_value = more_seg
        return more_score

    def predict(self, feature):
        """Classify a single feature value with the selected rule.

        :param feature: value of the feature this classifier was trained on
        :return: 1 or -1
        """
        if self.is_less:
            # "less than" rule: values below the threshold are positive.
            return 1 if feature < self.segment_value else -1
        # "more than" rule: values below the threshold are negative.
        return -1 if feature < self.segment_value else 1
class adaBoost(object):
    """AdaBoost binary classifier built from single-feature ``sign`` stumps.

    Each boosting round trains one ``sign`` classifier per feature column,
    keeps the one with the smallest weighted error, records its coefficient
    and re-weights the training samples. Labels are expected to be +1 / -1.
    """

    # Class-level default so instances that never ran __init__ still train
    # with the historical number of rounds.
    num_classifier = 50

    def __init__(self, num_classifier=50):
        """
        :param num_classifier: number of boosting rounds, i.e. how many weak
            classifiers ``train`` fits (defaults to 50)
        """
        self.num_classifier = num_classifier

    def __init_parameter_(self, train_data, label_data):
        """Reset all training state for a new data set.

        :param train_data: 2-D array-like, one row per sample
        :param label_data: labels aligned with the rows of ``train_data``
        """
        # train() slices columns with [:, index], which needs an ndarray;
        # asarray leaves an ndarray input untouched.
        self.train_data = np.asarray(train_data)
        self.label_data = label_data
        # Number of training samples.
        self.sample_num = len(self.train_data)
        # Number of features per sample.
        self.dimension = len(self.train_data[0])
        # Start from a uniform weight distribution over the samples.
        self.weight = [1.0 / self.sample_num] * self.sample_num
        # Coefficient of each selected weak classifier.
        self.alpha = []
        # Selected weak classifiers as [classifier, feature index] pairs.
        self.classifier = []

    def normalization_z(self, classifier, index):
        """
        Compute the normalization factor for the current weight update.

        :param classifier: weak classifier selected in this round
        :param index: feature index that classifier works on
        :return: sum over samples of ``w_i * exp(-alpha * y_i * G(x_i))``,
            using the most recently appended alpha
        """
        zm = 0
        for i in range(self.sample_num):
            zm += self.weight[i] * math.exp(
                -1 * self.alpha[-1] * self.label_data[i] * classifier.predict(self.train_data[i][index]))
        return zm

    def updata_weight(self, classifier, index, zm):
        """
        Update the sample weights in place after a boosting round.

        :param classifier: weak classifier selected in this round
        :param index: feature index that classifier works on
        :param zm: normalization factor the new weights are divided by
        """
        for i in range(len(self.weight)):
            self.weight[i] = (self.weight[i] * math.exp(
                -1 * self.alpha[-1] * self.label_data[i] * classifier.predict(self.train_data[i][index]))) / zm

    def train(self, train_data, label_data):
        """
        Fit ``self.num_classifier`` weak classifiers on the given data.

        :param train_data: 2-D array-like, one row per sample
        :param label_data: labels (+1 / -1) aligned with ``train_data``
        """
        self.__init_parameter_(train_data, label_data)
        for num in range(self.num_classifier):
            print("the " + str(num) + "weak classifier")
            # (classifier, feature index, weighted error) of the best stump so far.
            best_classifier = (None, None, 100000)
            for index in range(self.dimension):
                # Train one stump on this feature column under the current weights.
                classifier = sign(self.train_data[:, index], self.label_data, self.weight)
                score = classifier.train()
                if score < best_classifier[2]:
                    best_classifier = (classifier, index, score)
            em = best_classifier[2]
            if em == 0:
                # A zero error would divide by zero in the formula below,
                # so a large fixed coefficient is used instead.
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))
            self.classifier.append([best_classifier[0], best_classifier[1]])
            # Re-weight the samples using the coefficient just appended.
            zm = self.normalization_z(best_classifier[0], best_classifier[1])
            self.updata_weight(best_classifier[0], best_classifier[1], zm)

    def _predict_(self, feature):
        """
        Predict the label of a single sample.

        :param feature: feature vector of one sample
        :return: 1 if the alpha-weighted vote is positive, otherwise -1
        """
        result = 0.0
        # Iterate over the classifiers actually stored rather than over
        # num_classifier, so the two can never get out of step.
        for alpha, (classifier, index) in zip(self.alpha, self.classifier):
            result += alpha * classifier.predict(feature[index])
        return 1 if result > 0 else -1

    def predict(self, features):
        """
        Predict the labels of several samples.

        :param features: sequence of feature vectors
        :return: list with one prediction (1 or -1) per sample
        """
        return [self._predict_(features[i]) for i in range(len(features))]
if __name__ == '__main__':
    # --- Load and prepare the data -------------------------------------
    print("Start reading the data:")
    time1 = time.time()
    raw_path = r'F:\work and learn\ML\dataset\MNIST\train.csv'
    # The first row of the CSV is skipped (header).
    raw_data = np.loadtxt(raw_path, delimiter=',', skiprows=1)
    # Column 0 holds the label, the remaining columns hold the features.
    label, data = raw_data[:, 0], raw_data[:, 1:]
    # Two-class problem: label 1 stays 1, every other label becomes -1.
    label[label != 1] = -1
    # Binarize the features with a cut-off of 80.
    data = binary(data, 80)
    # Random split into training and test sets.
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.333, random_state=23333)
    time2 = time.time()
    print("read data cost:", time2 - time1, " seconds", '\n')

    # --- Train the model and report how long it took --------------------
    print("Start training:")
    ada = adaBoost()
    ada.train(train_data, train_label)
    time3 = time.time()
    print("training cost: ", time3 - time2, " seconds", '\n')

    # --- Predict the labels of the test set -----------------------------
    print("start predicting:")
    result = ada.predict(test_data)
    time4 = time.time()
    print("predict cost: ", time4 - time3, " seconds", '\n')

    # --- Report the accuracy --------------------------------------------
    score = accuracy_score(result, test_label)
    print("the accuracy is: ", score)

预测结果：

在设定弱分类器个数为50个的情况下，adaboost模型在mnist上的表现还是不错的：