K-近邻算法:
# coding:utf-8
# K-nearest-neighbors classifier and dating-data test harness.
# Code adapted from: "Machine Learning in Action" (机器学习实战).
import operator  # bug fix: was `import Operator` (wrong case) — module is lowercase

from numpy import *


def classify0(inx, dataset, labels, k):
    """Classify sample `inx` by majority vote among its k nearest neighbors.

    Args:
        inx: 1-D feature vector to classify.
        dataset: 2-D array, one training sample per row.
        labels: sequence of class labels, one per row of `dataset`.
        k: number of nearest neighbors to vote.

    Returns:
        The label that occurs most often among the k nearest rows.
    """
    row_size = dataset.shape[0]
    # Broadcast inx against every training row: (x1-x2, y1-y2, ...)
    diff_mat = tile(inx, (row_size, 1)) - dataset
    sq_diff_mat = diff_mat ** 2
    # Sum squared differences across each row, then take the square root
    sq_distances = sq_diff_mat.sum(axis=1)
    distances = sq_distances ** 0.5
    # Indices that would sort distances ascending (nearest first)
    sort_distance = argsort(distances)
    class_count = {}
    for i in range(k):
        vote_label = labels[sort_distance[i]]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1
    # bug fix: dict.iteritems() is Python-2-only; items() works on Python 3
    sort_class_count = sorted(class_count.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Most frequent label among the k neighbors
    return sort_class_count[0][0]


def file2matrix(filename):
    """Parse a tab-separated file into a feature matrix and label list.

    Each line is expected to hold 3 numeric feature columns followed by an
    integer class label.

    Returns:
        (mat, class_labels): an (n, 3) float matrix and a list of int labels.
    """
    # `with` ensures the file handle is closed (original leaked it)
    with open(filename) as fr:
        arraylines = fr.readlines()
    line_count = len(arraylines)
    mat = zeros((line_count, 3))
    class_labels = []
    for index, line in enumerate(arraylines):
        # bug fix: separator was '/t' (literal slash-t); the data is tab-separated
        list_line = line.strip().split('\t')
        mat[index, :] = list_line[0:3]
        class_labels.append(int(list_line[-1]))
    return mat, class_labels


def auto_norm(dataset):
    """Min-max normalize each column: new_value = (old - min) / (max - min).

    Returns:
        (norm_data, ranges, min_value): the normalized matrix plus the
        per-column range and minimum needed to normalize future samples.
    """
    min_value = dataset.min(0)
    max_value = dataset.max(0)
    # Renamed from `range` to avoid shadowing the builtin used elsewhere
    ranges = max_value - min_value
    row_count = dataset.shape[0]
    norm_data = dataset - tile(min_value, (row_count, 1))
    # bug fix: original divided by max_value; the formula requires the range
    norm_data = norm_data / tile(ranges, (row_count, 1))
    return norm_data, ranges, min_value


def data_class_test():
    """Hold out the first 10% of 'datingTestSet2.txt' and report the error rate."""
    ho_ratio = 0.10
    data_mat, class_labels = file2matrix('datingTestSet2.txt')
    norm_data, ranges, min_value = auto_norm(data_mat)
    row_count = norm_data.shape[0]
    num_test_vecs = int(row_count * ho_ratio)
    error_count = 0.0
    for i in range(num_test_vecs):
        # Train on the last 90%, test on each of the first 10%
        result = classify0(norm_data[i, :],
                           norm_data[num_test_vecs:row_count, :],
                           class_labels[num_test_vecs:row_count], 3)
        # bug fix: was `PRint(...)` — NameError at runtime
        print('预测结果是:%s 真实结果是:%s' % (result, class_labels[i]))
        if result != class_labels[i]:
            error_count += 1.0
    print('错误率是:%f' % (error_count / float(num_test_vecs)))


if __name__ == '__main__':
    # Guarded so importing this module no longer runs (and crashes without
    # the data file) as an unconditional side effect.
    data_class_test()
新闻热点
疑难解答