Implementing Naive Bayes, PCA, and K-means from scratch on the MNIST dataset

Naive Bayes

Bayes' theorem: $p(y|x)=\frac{p(x|y)p(y)}{p(x)}$

Preprocessing: binarize the image pixels

In [7]:
import numpy as np

def init(x):  # binarize the pixels so each feature follows a 0/1 Bernoulli distribution
    m, n = x.shape
    for i in range(m):
        for j in range(n):
            if x[i][j] != 0:
                x[i][j] = 1
    return x

p(y) is the prior: the frequency of each label 0-9 among the training images.

p(x|y) is the class-conditional distribution: for each label 0-9, the 0/1 distribution of each pixel.

Under the naive independence assumption, if x1=0 and x2=1, then p(x|y)=p(x1,x2|y)=p(x1|y)p(x2|y).

In [37]:
def train_p(x_train, y_train, classnum):  # estimate the prior and conditional probabilities
    x_train = init(x_train)
    m, n = x_train.shape
    p_y = np.zeros(classnum)          # prior p(y)
    p_c = np.zeros((classnum, n, 2))  # conditional p(x|y) per class, pixel, and pixel value
    for i in range(m):
        p_y[y_train[i]] += 1
        for j in range(n):
            p_c[y_train[i], j, int(x_train[i][j])] += 1
    p_y = p_y / m
    for i in range(classnum):
        for j in range(n):
            total = p_c[i][j][0] + p_c[i][j][1]  # compute the denominator once,
            p_c[i][j][0] = p_c[i][j][0] / total  # before either entry is overwritten
            p_c[i][j][1] = p_c[i][j][1] / total
    return p_y, p_c
In [9]:
def cal_p(x, num, p_y, p_c):  # posterior (up to p(x)) for class num; x is one image of shape (784,)
    n = x.shape[0]
    p = p_y[num]
    for i in range(n):
        p *= p_c[num][i][int(x[i])]
    return p
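
Multiplying 784 per-pixel probabilities can underflow float64 when many factors are small; a common remedy is to sum log-probabilities instead. A hypothetical variant, not part of the original (predict would then need max_p initialized to -np.inf, since log-probabilities are negative):

In [ ]:
def cal_log_p(x, num, p_y, p_c):  # hypothetical log-space variant of cal_p, to avoid underflow
    log_p = np.log(p_y[num])
    for i in range(x.shape[0]):
        log_p += np.log(p_c[num][i][int(x[i])])  # log(0) -> -inf, which still compares correctly
    return log_p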
In [10]:
def predict(x_test, y_test, classnum, x_train, y_train):  # classify the test set and report accuracy
    p_y, p_c = train_p(x_train, y_train, classnum)
    x_test = init(x_test)
    m, n = x_test.shape
    count = 0
    for i in range(m):
        max_p = -1
        index = -1
        for j in range(classnum):  # pick the most probable label among 0-9
            pro_result = cal_p(x_test[i], j, p_y, p_c)
            if max_p < pro_result:
                max_p = pro_result
                index = j
        # print("test sample {} predicted as {}, label {}".format(i, index, y_test[i]))
        if index == y_test[i]:
            count += 1
    score = count / m
    return score
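
A minimal usage sketch; loading MNIST through scikit-learn's fetch_openml is an assumption of this example, not part of the original notebook, and the small subsets keep the pure-Python loops fast:

In [ ]:
import numpy as np
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', as_frame=False)  # hypothetical data source
x = mnist.data.astype(int)    # (70000, 784) flattened images
y = mnist.target.astype(int)  # labels 0-9
x_train, y_train = x[:5000], y[:5000]
x_test, y_test = x[5000:6000], y[5000:6000]
print("Naive Bayes accuracy: {}".format(predict(x_test, y_test, 10, x_train, y_train)))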

PCA

In [3]:
def pca(x, k):
    u = np.mean(x, axis=0)        # per-feature mean
    x = x - u                     # mean-center the data
    sigma = np.cov(x, rowvar=0)   # covariance matrix
    w, v = np.linalg.eigh(sigma)  # eigh suits the symmetric sigma and avoids spurious complex output from eig
    index = np.argsort(-w)        # sort eigenvalues in descending order, return indices
    index_change = index[:k]      # keep the top k
    v_change = v[:, index_change]
    z = x.dot(v_change)           # project x onto the top-k eigenvectors
    return z
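
A short usage sketch, assuming x is an (m, 784) float matrix of flattened MNIST images and an illustrative k=50:

In [ ]:
z = pca(x, 50)   # project 784-dimensional images onto the top 50 principal components
print(z.shape)   # (m, 50)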

K-means

In [50]:
def init_c(x, k):    # initialize the centroids by picking k distinct random samples
    m, n = x.shape
    c = np.zeros((k, n))
    idx = np.random.choice(m, k, replace=False)  # without replacement, so no duplicate centroids
    for i in range(k):
        c[i, :] = x[idx[i], :]
    return c

Compute $\min_{k}\|x^{(i)}-c_k\|^2$ for every sample

In [51]:
def compute_c(x, c):    # assign each sample to its nearest centroid
    m, n = x.shape
    k = c.shape[0]
    idx = np.zeros(m)   # idx[i] holds the cluster assignment of sample i
    for i in range(m):
        min_distance = np.inf
        for j in range(k):
            distance = np.sum((x[i, :] - c[j, :]) ** 2)
            if distance < min_distance:
                min_distance = distance
                idx[i] = j
    return idx
In [55]:
def update_c(x, k, idx):  # recompute each centroid as the mean of its assigned samples
    m, n = x.shape
    c = np.zeros((k, n))
    for i in range(k):
        indices = np.where(idx == i)[0]  # all samples currently assigned to cluster i
        if len(indices) > 0:             # guard against empty clusters
            c[i, :] = np.mean(x[indices, :], axis=0)
    return c
In [56]:
def run_one_kmeans(x, k, repeat):  # alternate assignment and update steps for a fixed number of iterations
    c = init_c(x, k)  # random initial centroids
    m, n = x.shape
    idx = np.zeros(m)
    for i in range(repeat):
        idx = compute_c(x, c)
        c = update_c(x, k, idx)
    return idx, c

Minimize the average distance from each sample to its assigned centroid: $\min \frac{1}{m}\sum_{i=1}^{m}\|x^{(i)}-c_{idx[i]}\|^2$

In [2]:
def run_all_kmeans(x, k, repeat, r_min):  # r_min random restarts; keep the run with the lowest cost
    m, n = x.shape
    min_cost = np.inf
    result_idx = np.zeros(m)
    result_c = np.zeros((k, n))
    for i in range(r_min):
        idx, c = run_one_kmeans(x, k, repeat)
        cost = 0
        for j in range(m):
            cost += np.sum((x[j, :] - c[int(idx[j]), :]) ** 2)
        cost = cost / m
        if cost < min_cost:
            min_cost = cost
            result_idx = idx
            result_c = c
    return result_idx, result_c
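
A usage sketch, again assuming x is an (m, 784) float matrix of flattened images; the parameter values are illustrative:

In [ ]:
idx, centers = run_all_kmeans(x, 10, 20, 5)  # 10 clusters, 20 iterations per run, best of 5 restarts
print(centers.shape)  # (10, 784)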

Paper implementation

Randomly crop the training images: cut each image into k random patches of size size*size.

In [ ]:
def randomcut(x_train, k, size):  # cut each image into k random size x size patches
    m, nx, ny = x_train.shape
    x = np.zeros((m, k, size, size))
    for i in range(m):
        for j in range(k):
            idx = np.random.randint(0, nx - size + 1)  # top-left corner; +1 so every valid position is reachable
            idy = np.random.randint(0, ny - size + 1)
            x[i][j] = x_train[i][idx:idx + size, idy:idy + size]
    return x

Preprocess the data (standardization): $x=\frac{x-u}{s}$, where u is the mean and s the standard deviation.

In [ ]:
def init(x_train):  # standardize the data (note: this redefines the earlier binarizing init)
    u = np.mean(x_train, axis=0)        # per-feature mean
    s = np.std(x_train, axis=0) + 1e-8  # small epsilon avoids division by zero on constant pixels
    x_train = (x_train - u) / s
    return x_train

Cluster the randomly cut patches into 30 classes with the K-means code above, as sketched below.
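
A sketch of that call in isolation, reusing the argument values from the commented-out line in xtrain_work further down (x_cut_init is the standardized patch matrix built there):

In [ ]:
# Sketch: cluster the standardized random patches into 30 classes with our
# K-means (30 iterations per run, best of 10 restarts, mirroring the
# commented-out call inside xtrain_work).
result_idx, result_c = run_all_kmeans(x_cut_init, 30, 30, 10)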

Cut the training images on a regular grid: each image is cut into k patches of size size*size.

In [ ]:
def rulecut(x_test, k, size):  # grid cut with stride 3; for 28x28 images and size=6 this gives an 8x8 grid, so k should be 64
    m, nx, ny = x_test.shape
    x = np.zeros((m, k, size, size))
    for i in range(m):
        for j in range(k // 8):
            for t in range(8):
                x[i][j * 8 + t] = x_test[i][t * 3:t * 3 + size, j * 3:j * 3 + size]
    return x

For every grid patch, compute the distance to each cluster center; set the entry for the nearest center to 1 and all others to 0 (a hard one-hot assignment).

In [ ]:
from sklearn.cluster import KMeans

def xtrain_work(x, m):  # m is the number of samples; builds the training feature map
    pool_param = {'pool_height': 4, 'pool_width': 4, 'stride': 4}
    x_cut = ac.randomcut(x, 20, 6)  # random cut: 20 patches of 6x6 per image (ac is the module holding the cut functions above)
    X_cut = ac.rulecut(x, 64, 6)    # grid cut: 64 patches of 6x6 per image
    x_cut = x_cut.reshape(x_cut.shape[0] * x_cut.shape[1], x_cut.shape[2] * x_cut.shape[3])
    X_cut = X_cut.reshape(X_cut.shape[0] * X_cut.shape[1], X_cut.shape[2] * X_cut.shape[3])
    x_cut_init = init(x_cut)  # standardize
    X_cut_init = init(X_cut)
    jnum = 30  # number of clusters
    # result_idx, result_c = kmeans.run_all_kmeans(x_cut_init, jnum, 30, 10)  # our own K-means
    clf_kmeans = KMeans(n_clusters=jnum).fit(x_cut_init)
    result_c = clf_kmeans.cluster_centers_
    result = np.zeros((m, jnum, 64))
    for i in range(m):  # hard one-hot assignment of every grid patch
        for j in range(64):
            min_dist = np.inf
            index = -1
            for k in range(jnum):
                dist = np.sum((X_cut_init[i * 64 + j] - result_c[k]) ** 2)
                if dist < min_dist:
                    min_dist = dist
                    index = k
            result[i][index][j] = 1
    result = result.reshape(m, jnum, 8, 8)
    x_result = average_pooling_forward(result, pool_param)
    x_result = x_result.reshape(x_result.shape[0], x_result.shape[1] * x_result.shape[2] * x_result.shape[3])
    return x_result, result_c

Average-pool the resulting feature maps.

In [ ]:
def average_pooling_forward(x, pool_param):  # forward pass of average pooling
    N, C, H, W = x.shape
    HH, WW, stride = pool_param['pool_height'], pool_param['pool_width'], pool_param['stride']
    H_out = (H - HH) // stride + 1
    W_out = (W - WW) // stride + 1
    out = np.zeros((N, C, H_out, W_out))
    for i in range(H_out):
        for j in range(W_out):
            x_mask = x[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
            out[:, :, i, j] = np.mean(x_mask, axis=(2, 3))  # average over each window
    return out
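
A quick shape check for the pooling step as used here: the (m, 30, 8, 8) one-hot maps pool down to (m, 30, 2, 2) with a 4x4 window and stride 4, i.e. 120 features per image after flattening. The dummy input is an illustrative assumption:

In [ ]:
pool_param = {'pool_height': 4, 'pool_width': 4, 'stride': 4}
demo = np.zeros((5, 30, 8, 8))  # 5 dummy images, 30 clusters, 8x8 grid
print(average_pooling_forward(demo, pool_param).shape)  # (5, 30, 2, 2)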

Cut each test image into 64 grid patches of size 6*6, compute the distance from each patch to every cluster center, set the nearest to 1 and the rest to 0, then pool.

In [ ]:
def xtest_work(x, m, result_c):  # builds the test feature map using the training centroids
    pool_param = {'pool_height': 4, 'pool_width': 4, 'stride': 4}
    jnum = result_c.shape[0]
    X_cut = ac.rulecut(x, 64, 6)  # grid cut: 64 patches of 6x6 per image
    X_cut = X_cut.reshape(X_cut.shape[0] * X_cut.shape[1], X_cut.shape[2] * X_cut.shape[3])
    X_cut_init = init(X_cut)
    result = np.zeros((m, jnum, 64))
    for i in range(m):  # hard one-hot assignment, as in xtrain_work
        for j in range(64):
            min_dist = np.inf
            index = -1
            for k in range(jnum):
                dist = np.sum((X_cut_init[i * 64 + j] - result_c[k]) ** 2)
                if dist < min_dist:
                    min_dist = dist
                    index = k
            result[i][index][j] = 1
    result = result.reshape(m, jnum, 8, 8)
    x_result = average_pooling_forward(result, pool_param)
    x_result = x_result.reshape(x_result.shape[0], x_result.shape[1] * x_result.shape[2] * x_result.shape[3])
    return x_result

Classify the pooled features with an SVM (and, for comparison, a decision tree).

In [ ]:
from sklearn.svm import SVC
from sklearn import tree

x_train_result, result_c = xtrain_work(X_train, mt)  # mt/ms are the train/test sample counts
clf = SVC(gamma='auto', kernel='rbf')
clf.fit(x_train_result, y_train)
x_test_result = xtest_work(X_test, ms, result_c)
print("SVM accuracy: {}".format(clf.score(x_test_result, y_test)))
clf = tree.DecisionTreeClassifier()  # decision tree
clf = clf.fit(x_train_result, y_train)
print("Decision tree accuracy: {}".format(clf.score(x_test_result, y_test)))

Experimental results

The experiments use 20,000 training images and 20,000 test images. Varying the number of cluster centers, switching between the scikit-learn KMeans and our own implementation, and trying different classifiers yields different results, summarized in the table below.

\begin{array}{cccc} \text{Clusters} & \text{Clustering} & \text{Classifier} & \text{Accuracy} \\ \hline 30 & \text{scikit-learn} & \text{SVM} & 0.6999 \\ \hline 30 & \text{scikit-learn} & \text{Decision tree} & 0.7649 \\ \hline 20 & \text{scikit-learn} & \text{SVM} & 0.7765 \\ \hline 20 & \text{scikit-learn} & \text{Decision tree} & 0.7741 \\ \hline 20 & \text{Ours} & \text{Decision tree} & 0.7723 \\ \hline 20 & \text{Ours} & \text{SVM} & 0.7784 \\ \hline \end{array}

The experiments show that accuracy varies with the number of clusters, the clustering implementation, and the classifier; on the whole the SVM classifier scores slightly higher, the 30-cluster run being the exception.