前言:这篇是代码部分,不涉及原理的阐述。整个程序的实现分为两种:第一种是纯手工代码,不调用库;第二种借助sklearn库来实现。
这里使用的k近邻案例是我们比较熟悉的手写数字识别,其中我会把训练数据、测试数据和程序放在同一个文件夹下。
实现方法一:
from numpy import *
from os import listdir
import operator
import time
# Decorator used to measure how long the decorated function takes to run.
def wrapper(func):
    """Return a version of ``func`` that prints its running time.

    Positional and keyword arguments are forwarded to ``func`` and its
    return value is handed back to the caller, so the decorator also works
    on functions that take parameters or return a result.

    Args:
        func: the callable to time.

    Returns:
        A callable with the same call signature as ``func`` that prints
        ``the running time: <seconds>`` after ``func`` returns.
    """
    # Imported locally so this block does not need an extra file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def warrper(*args, **kwargs):
        # perf_counter is the clock intended for measuring short intervals.
        starttime = time.perf_counter()
        result = func(*args, **kwargs)
        runtime = time.perf_counter() - starttime
        print("the running time:", runtime)
        return result

    return warrper
# Compute the distance from inX to every training sample, keep the k nearest
# samples and return the label that occurs most often among them.
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify; a sequence or array that broadcasts
            against one row of ``dataSet``.
        dataSet: 2-D array-like with one training sample per row.
        labels: sequence holding the label of each row of ``dataSet``.
        k: number of neighbours that vote. Values larger than the number of
            rows simply use every row.

    Returns:
        The label with the most votes. On a tie the label whose first vote
        came from the closer neighbour wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    samples = asarray(dataSet)
    if k < 1:
        raise ValueError("k must be at least 1")
    if samples.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting subtracts inX from every row, so no explicit tile() copy
    # of inX is needed.
    diffMat = asarray(inX) - samples
    Distance = (diffMat ** 2).sum(axis=1) ** 0.5
    sorteddistances = Distance.argsort()

    classCount = {}
    # Slicing clamps k to the number of available samples.
    for index in sorteddistances[:k]:
        votelable = labels[index]
        classCount[votelable] = classCount.get(votelable, 0) + 1

    # sorted() is stable, so equal counts keep their insertion order.
    # NOTE: builtins such as max()/sum() are avoided on purpose because
    # `from numpy import *` can shadow them.
    sortedClasscount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClasscount[0][0]
# Flatten a 32x32 text image into a single row vector.
def img2vetor(filename):
    """Read a 32x32 digit image stored as text and return a 1x1024 vector.

    The first 32 lines of the file are read and the first 32 characters of
    each line are converted with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024); element ``32 * i + j`` holds the
        value found at line ``i``, column ``j`` of the file.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(linestr[j])
    return returnVect
# Load the training and test data, classify every test image and report
# the number of errors.
@wrapper
def handwritingclasstest():
    """Evaluate the hand-written kNN classifier on the digit data set.

    Every file in ``./trainingDigits`` is loaded as a training sample and
    every file in ``./testDigits`` is classified with ``classify0`` using
    k = 3. The true label of a file is the integer before the first
    underscore in its name. Each prediction, the total number of errors and
    the error rate are printed.
    """
    hwLables = []
    trainingfileList = listdir('./trainingDigits')
    m = len(trainingfileList)
    trainingMat = zeros((m, 1024))
    for i, filenamestr in enumerate(trainingfileList):
        # Drop the extension, then take the part before the first underscore.
        filestr = filenamestr.split('.')[0]
        classNumstr = int(filestr.split('_')[0])
        hwLables.append(classNumstr)
        trainingMat[i, :] = img2vetor('./trainingDigits/%s' % filenamestr)

    testfileList = listdir('./testDigits')
    errorcount = 0
    mTest = len(testfileList)
    for testfilename in testfileList:
        testclassNum = int(testfilename.split('_')[0])
        vectorUndertest = img2vetor('./testDigits/%s' % testfilename)
        classResult = classify0(vectorUndertest, trainingMat, hwLables, 3)
        print('the classifier come back with:%d,the real answer is %d' % (classResult, testclassNum))
        if classResult != testclassNum:
            errorcount += 1

    print("\nthe totle error is %d" % errorcount)
    # Guard against an empty test directory instead of dividing by zero.
    errorrate = errorcount / float(mTest) if mTest else 0.0
    print("\nthe totle error rate is %f" % errorrate)


handwritingclasstest()
程序运行结果:
the totle error is 10
the totle error rate is 0.010571
the running time: 38.14647126197815
Process finished with exit code 0
实现方法二:
from numpy import *
from os import listdir
from sklearn.neighbors import KNeighborsClassifier
import time
def wrapper(func):
    """Return a version of ``func`` that prints its running time in seconds.

    Positional and keyword arguments are forwarded to ``func`` and its
    return value is handed back to the caller.

    Args:
        func: the callable to time.

    Returns:
        A callable with the same call signature as ``func`` that prints the
        elapsed time after ``func`` returns.
    """
    # Imported locally so this block does not need an extra file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def warrper(*args, **kwargs):
        # perf_counter is the clock intended for measuring short intervals.
        starttime = time.perf_counter()
        result = func(*args, **kwargs)
        runtime = time.perf_counter() - starttime
        print(runtime)
        return result

    return warrper
# This step is required: each image has to be converted to a
# one-dimensional vector before it can be used as a sample.
def img2vector(filename):
    """Read a 32x32 digit image stored as text and return a 1x1024 vector.

    The first 32 lines of the file are read and the first 32 characters of
    each line are converted with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024); element ``32 * i + j`` holds the
        value found at line ``i``, column ``j`` of the file.
    """
    returnVector = zeros((1, 1024))
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnVector[0, 32 * i + j] = int(linestr[j])
    return returnVector
# Load the training images and convert each of them to a vector.
def training2vetor():
    """Build the training matrix and label list from ``./trainingDigits``.

    Each file in the directory becomes one row of the matrix; its label is
    the integer before the first underscore in the file name.

    Returns:
        A tuple ``(trainMat, hwLabels)`` where ``trainMat`` has shape
        (number of files, 1024) and ``hwLabels`` is the list of integer
        labels in the same order as the rows.
    """
    trainingfileList = listdir('./trainingDigits')
    m = len(trainingfileList)
    trainMat = zeros((m, 1024))
    hwLabels = []
    for i, filename in enumerate(trainingfileList):
        trainMat[i, :] = img2vector('./trainingDigits/%s' % filename)
        trainingNum = int(filename.split('_')[0])
        hwLabels.append(trainingNum)
    return trainMat, hwLabels
# Classify the test data with scikit-learn and report the number of errors.
@wrapper
def testclass():
    """Evaluate scikit-learn's kNN classifier on the digit data set.

    A ``KNeighborsClassifier`` (3 neighbours, kd-tree) is fitted on the
    data returned by ``training2vetor`` and used to classify every file in
    ``./testDigits``. The true label of a file is the integer before the
    first underscore in its name. The total number of errors and the error
    rate are printed.
    """
    clf = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree', n_jobs=-1)
    trainMat, hwLabels = training2vetor()
    clf.fit(trainMat, hwLabels)

    testclassList = listdir('./testDigits')
    mTest = len(testclassList)
    testLabels = []
    testMat = zeros((mTest, 1024))
    for i, testname in enumerate(testclassList):
        testLabels.append(int(testname.split('_')[0]))
        testMat[i, :] = img2vector('./testDigits/%s' % testname)

    # predict() accepts a 2-D array of samples, so all test images are
    # classified in one call instead of one call per image. An empty test
    # set is skipped because there is nothing to predict.
    predictions = clf.predict(testMat) if mTest else []

    errorcount = 0
    for predicted, actual in zip(predictions, testLabels):
        if predicted != actual:
            errorcount += 1

    print("\nthe totle error is %d" % errorcount)
    # Guard against an empty test directory instead of dividing by zero.
    errorrate = errorcount / float(mTest) if mTest else 0.0
    print("\nthe totle error rate is %f" % errorrate)


testclass()
# 运行完后,明显发现调用库比纯手写的代码执行效率要低,故加了一个装饰器来对比两个程序的运行时间
程序运行结果:
the totle error is 12
the totle error rate is 0.012685
103.89552760124207
Process finished with exit code 0
最后,写一写自己这次实战的感受。由于调库版本的程序运行时间有点长,我自己编写了装饰器来测试程序的运行时间。果不其然,第二种方法(调库)不仅在效率上较差,而且在准确率上也比纯手写的(不调库)低。这点我自己很迷惑,如果有朋友看到这个问题并且知道部分原因,恳请在下方留言,实在感谢。
---------------------
【转载】
作者:不曾走远~
原文:https://blog.csdn.net/qq_20412595/article/details/82466543
|
|