Implementing Naive Bayes, PCA, and K-means from Scratch on the MNIST Dataset
Naive Bayes
Bayes' theorem: $p(y|x)=\frac{p(x|y)p(y)}{p(x)}$
Preprocessing: binarize the image pixels.
import numpy as np

def init(x):  # binarize the image pixels into a 0/1 (Bernoulli) distribution
    x = x.astype(int)  # pixel values are later used as indices, so they must be ints
    m, n = x.shape
    for i in range(m):
        for j in range(n):
            if x[i][j] != 0:
                x[i][j] = 1
    return x
$p(y)$ is the frequency of images with label 0-9 among the training samples.
$p(x|y)$ is the 0/1 distribution of each pixel among the images with a given label 0-9.
Under the naive conditional-independence assumption the pixels factorize: for example, if $x_1=0$ and $x_2=1$, then $p(x|y)=p(x_1,x_2|y)=p(x_1|y)\,p(x_2|y)$.
def train_p(x_train, y_train, classnum):  # estimate the probabilities
    x_train = init(x_train)
    m, n = x_train.shape
    p_y = np.zeros(classnum)  # prior probability p(y)
    p_c = np.zeros((classnum, n, 2))  # conditional probability p(x|y)
    for i in range(m):
        p_y[y_train[i]] += 1
        for j in range(n):
            p_c[y_train[i], j, x_train[i][j]] += 1
    p_y = p_y / m
    for i in range(classnum):
        for j in range(n):
            # read the total before either count is overwritten, and apply
            # Laplace smoothing so no probability is exactly zero
            total = p_c[i][j][0] + p_c[i][j][1]
            p_c[i][j][0] = (p_c[i][j][0] + 1) / (total + 2)
            p_c[i][j][1] = (p_c[i][j][1] + 1) / (total + 2)
    return p_y, p_c
def cal_p(x, num, p_y, p_c):  # log-probability of image x (shape (784,)) under class num, any digit 0-9
    n = x.shape[0]
    p = np.log(p_y[num])
    for i in range(n):
        # sum log-probabilities instead of multiplying 784 factors, to avoid underflow
        p += np.log(p_c[num][i][x[i]])
    return p
def predict(x_test, y_test, classnum, x_train, y_train):  # run prediction
    p_y, p_c = train_p(x_train, y_train, classnum)
    x_test = init(x_test)
    m, n = x_test.shape
    count = 0
    for i in range(m):
        max_p = -np.inf  # log-probabilities are negative, so start at -inf
        index = -1
        for j in range(classnum):  # pick the most probable label among 0-9
            pro_result = cal_p(x_test[i], j, p_y, p_c)
            if max_p < pro_result:
                max_p = pro_result
                index = j
        # print("test image {} predicted as {}, true label {}".format(i, index, y_test[i]))
        if index == y_test[i]:
            count += 1
    score = count / m
    return score
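To try the classifier end to end, MNIST can be loaded in any way that yields flattened 28×28 arrays with integer labels; the snippet below is a minimal sketch that assumes scikit-learn's fetch_openml and subsamples the data, since the pure-Python loops are slow:

from sklearn.datasets import fetch_openml

x, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
x, y = x.astype(int), y.astype(int)  # pixel values index into p_c, so they must be ints
x_train, y_train = x[:5000], y[:5000]
x_test, y_test = x[5000:6000], y[5000:6000]
print("accuracy:", predict(x_test, y_test, 10, x_train, y_train))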
PCA
def pca(x, k):
    u = np.mean(x, axis=0)  # compute the mean
    x = x - u  # mean-center the data
    sigma = np.cov(x, rowvar=False)  # covariance matrix
    w, v = np.linalg.eigh(sigma)  # eigh suits symmetric matrices and returns real values: w eigenvalues, v eigenvectors
    index = np.argsort(-w)  # sort eigenvalues in descending order, return indices
    index_change = index[:k]  # take the top k
    v_change = v[:, index_change]
    z = x.dot(v_change)  # coordinates of x in the v_change basis
    return z
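For example, projecting the 784-dimensional images down to 50 components; a sketch, assuming x_train holds the flattened MNIST images loaded in the Naive Bayes example above. The share of variance retained can be read off the eigenvalue spectrum:

z = pca(x_train[:2000].astype(float), 50)  # (2000, 784) -> (2000, 50)
print(z.shape)
xc = x_train[:2000] - x_train[:2000].mean(axis=0)
w = np.linalg.eigh(np.cov(xc, rowvar=False))[0]
print(np.sort(w)[::-1][:50].sum() / w.sum())  # fraction of total variance kept by the top 50 components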
K-means
def init_c(x, k):  # initialize the cluster centers by picking k random samples
    m, n = x.shape
    c = np.zeros((k, n))
    idx = np.random.choice(m, k, replace=False)  # sample without replacement so no two centers coincide
    for i in range(k):
        c[i, :] = x[idx[i], :]
    return c
Assign each sample to its nearest center by computing $\arg\min_{k}\|x^{(i)}-c_k\|^2$.
def compute_c(x, c):  # assign each sample to its nearest cluster center
    m, n = x.shape
    k = c.shape[0]
    idx = np.zeros(m)  # idx holds the predicted label of each sample
    for i in range(m):
        min_distance = np.inf  # a fixed constant like 100000 can be smaller than every real distance
        for j in range(k):
            distance = np.sum((x[i, :] - c[j, :]) ** 2)
            if distance < min_distance:
                min_distance = distance
                idx[i] = j
    return idx
def updata_c(x, k, idx):  # update the cluster centers
    m, n = x.shape
    c = np.zeros((k, n))  # holds the cluster centers
    for i in range(k):
        members = np.where(idx == i)[0]  # all samples assigned to cluster i
        if len(members) > 0:  # guard against empty clusters, which would divide by zero
            c[i, :] = np.mean(x[members, :], axis=0)
    return c
def run_one_kmeans(x, k, repeate):  # alternate assignment and update to refine the centers
    c = init_c(x, k)  # initialize the center matrix
    m, n = x.shape
    idx = np.zeros(m)
    for i in range(repeate):
        idx = compute_c(x, c)
        c = updata_c(x, k, idx)
    return idx, c
Each run is scored by the average distortion of the samples to their assigned centers, $\frac{1}{m}\sum_{i=1}^{m}\|x^{(i)}-c_{idx^{(i)}}\|^2$, and the run with the smallest value is kept.
def run_all_kmeans(x, k, repeate, r_min):  # restart with random centers r_min times, keep the lowest-cost run
    m, n = x.shape
    min_cost = np.inf
    result_idx = np.zeros(m)
    result_c = np.zeros((k, n))
    for i in range(r_min):
        idx, c = run_one_kmeans(x, k, repeate)
        cost = 0
        for j in range(m):
            cost += np.sum((x[j, :] - c[int(idx[j]), :]) ** 2)
        cost = cost / m
        if cost < min_cost:
            min_cost = cost
            result_idx = idx
            result_c = c
    return result_idx, result_c
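A minimal end-to-end sketch, again assuming x_train from the Naive Bayes example: the images are first reduced with pca so the pure-Python distance loops stay tractable, then clustered into 10 groups:

z = pca(x_train[:2000].astype(float), 50)  # 784 -> 50 dimensions
idx, c = run_all_kmeans(z, 10, 20, 5)  # 10 clusters, 20 iterations per run, 5 random restarts
print(idx[:20])  # cluster assignments of the first 20 images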
Paper Implementation
Random patch extraction on the training set: each image is randomly cut into k patches of size size*size.
def randomcut(x_train, k, size):  # randomly crop k patches of size*size from each image
    m, nx, ny = x_train.shape
    x = np.zeros((m, k, size, size))
    for i in range(m):
        for j in range(k):
            # for 28x28 images the valid top-left corner ranges over [0, 28-size]
            idx = np.random.randint(0, 28 - size + 1)
            idy = np.random.randint(0, 28 - size + 1)
            x[i][j] = x_train[i][idx:idx + size, idy:idy + size]
    return x
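For example, extracting 20 random 6×6 patches per image; a seed makes the crops reproducible, and x_train is assumed to hold flattened MNIST images as in the earlier sections:

np.random.seed(0)  # reproducible crops
patches = randomcut(x_train[:100].reshape(-1, 28, 28), 20, 6)
print(patches.shape)  # (100, 20, 6, 6)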
Preprocess the data by standardization, $x=\frac{x-u}{s}$, where $u$ is the mean and $s$ is the standard deviation. (Note that this init standardizes the patches; it is distinct from the binarization helper of the same name in the Naive Bayes section.)
def init(x_train):  # standardize the data
    u = np.mean(x_train, axis=0)  # compute the mean
    s = np.std(x_train, axis=0)  # compute the standard deviation
    x_train = (x_train - u) / (s + 1e-8)  # small epsilon guards pixels with zero variance
    return x_train
Cluster the randomly cropped patches into 30 clusters using the K-means algorithm above.
Regular patch extraction on the training set: each image is cut into k grid-aligned patches of size size*size.
def rulecut(x_test, k, size):  # regular grid crop; the loop assumes 8 row offsets, so k should be 64 (an 8x8 grid with stride 3)
    m, nx, ny = x_test.shape
    x = np.zeros((m, k, size, size))
    for i in range(m):
        for j in range(k // 8):
            for t in range(8):
                x[i][j * 8 + t] = x_test[i][t * 3:t * 3 + size, j * 3:j * 3 + size]
    return x
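With size=6 the eight offsets 0, 3, ..., 21 give patches spanning pixels 0-26 in each direction, so the 8×8 grid of overlapping patches covers nearly the whole 28×28 image. A quick shape check, again assuming flattened MNIST images in x_train:

grid = rulecut(x_train[:10].reshape(-1, 28, 28), 64, 6)
print(grid.shape)  # (10, 64, 6, 6)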
For each regularly cropped patch, compute the distance to every cluster center; the entry of the nearest center is set to 1 and all others to 0 (a hard one-hot assignment).
from sklearn.cluster import KMeans  # scikit-learn's K-means; the hand-written run_all_kmeans also works
# ac refers to the module holding the randomcut/rulecut helpers defined above

def xtrain_work(x, m):  # m is the number of samples
    pool_param = {'pool_height': 4, 'pool_width': 4, 'stride': 4}
    x_cut = ac.randomcut(x, 20, 6)  # random crop: 20 patches of 6*6 per image
    X_cut = ac.rulecut(x, 64, 6)  # regular crop: 64 patches of 6*6 per image
    x_cut = x_cut.reshape(x_cut.shape[0] * x_cut.shape[1], x_cut.shape[2] * x_cut.shape[3])
    X_cut = X_cut.reshape(X_cut.shape[0] * X_cut.shape[1], X_cut.shape[2] * X_cut.shape[3])
    x_cut_init = init(x_cut)  # standardize the patches
    X_cut_init = init(X_cut)
    jnum = 30  # number of clusters
    # result_idx, result_c = kmeans.run_all_kmeans(x_cut_init, jnum, 30, 10)  # hand-written clustering
    clf_kmeans = KMeans(n_clusters=jnum).fit(x_cut_init)
    result_c = clf_kmeans.cluster_centers_
    result = np.zeros((m, jnum, 64))
    for i in range(m):  # one-hot encode each patch by its nearest cluster center
        for j in range(64):
            min_dist = np.inf  # renamed from min to avoid shadowing the built-in
            index = -1
            for k in range(jnum):
                dist = np.sum((X_cut_init[i * 64 + j] - result_c[k]) ** 2)
                if dist < min_dist:
                    min_dist = dist
                    index = k
            result[i][index][j] = 1
    result = result.reshape(m, jnum, 8, 8)
    x_result = average_pooling_forward(result, pool_param)
    x_result = x_result.reshape(x_result.shape[0], x_result.shape[1] * x_result.shape[2] * x_result.shape[3])
    return x_result, result_c
Pool the encoded features (average pooling).
def average_pooling_forward(x, pool_param):  # forward pass of average pooling
    N, C, H, W = x.shape
    HH, WW, stride = pool_param['pool_height'], pool_param['pool_width'], pool_param['stride']
    H_out = int((H - HH) / stride + 1)
    W_out = int((W - WW) / stride + 1)
    out = np.zeros((N, C, H_out, W_out))
    for i in range(H_out):
        for j in range(W_out):
            x_mask = x[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
            out[:, :, i, j] = np.mean(x_mask, axis=(2, 3))
    return out
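As a sanity check of the pooling arithmetic: a 4×4 window with stride 4 over the (m, 30, 8, 8) one-hot maps yields (m, 30, 2, 2), i.e. 30*2*2 = 120 features per image after flattening:

feat = average_pooling_forward(np.zeros((5, 30, 8, 8)), {'pool_height': 4, 'pool_width': 4, 'stride': 4})
print(feat.shape)  # (5, 30, 2, 2)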
Regularly crop each test image into 64 patches of size 6*6, one-hot encode each patch by its nearest cluster center (nearest distance set to 1, all others 0), and then pool.
def xtest_work(x, m, result_c):
    pool_param = {'pool_height': 4, 'pool_width': 4, 'stride': 4}
    jnum = result_c.shape[0]
    X_cut = ac.rulecut(x, 64, 6)  # regular crop: 64 patches of 6*6 per image
    X_cut = X_cut.reshape(X_cut.shape[0] * X_cut.shape[1], X_cut.shape[2] * X_cut.shape[3])
    X_cut_init = init(X_cut)
    result = np.zeros((m, jnum, 64))
    for i in range(m):  # one-hot encode each patch by its nearest cluster center
        for j in range(64):
            min_dist = np.inf
            index = -1
            for k in range(jnum):
                dist = np.sum((X_cut_init[i * 64 + j] - result_c[k]) ** 2)
                if dist < min_dist:
                    min_dist = dist
                    index = k
            result[i][index][j] = 1
    result = result.reshape(m, jnum, 8, 8)
    x_result = average_pooling_forward(result, pool_param)
    x_result = x_result.reshape(x_result.shape[0], x_result.shape[1] * x_result.shape[2] * x_result.shape[3])
    return x_result
Classify with an SVM classifier (a decision tree is also tried for comparison).
from sklearn.svm import SVC
from sklearn import tree

# X_train, X_test, y_train, y_test and the sample counts mt, ms are assumed to be prepared earlier
x_train_result, result_c = xtrain_work(X_train, mt)
clf = SVC(gamma='auto', kernel='rbf')
clf.fit(x_train_result, y_train)
x_test_result = xtest_work(X_test, ms, result_c)
print("SVM accuracy: {}".format(clf.score(x_test_result, y_test)))
clf = tree.DecisionTreeClassifier()  # decision tree
clf = clf.fit(x_train_result, y_train)
print("Decision tree accuracy: {}".format(clf.score(x_test_result, y_test)))
Experimental Results
The experiments use 20,000 training images and 20,000 test images. Varying the number of cluster centers, switching between the hand-written K-means and the scikit-learn KMeans implementation, and trying different classifiers yields different results, which are summarized in the table below.
The experiments show that the number of cluster centers, the clustering implementation, and the classifier all affect the accuracy; overall, the SVM classifier achieves the highest accuracy.