# Keep only the rows that have at least N non-NaN values (thresh=N);
# rows with fewer than N non-NaN values are dropped.
# Assumes df is a pandas DataFrame defined elsewhere; the result is bound to
# df2 and df itself is not reassigned here.
df2=df.dropna(thresh=N)
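# Example: given the original df (the leftmost column is the row index and
# the first row holds the column labels):
#           0         1         2
# 0  0.709057       NaN       NaN
# 1 -1.483965       NaN       NaN
# 2 -0.429491       NaN       NaN
# 3  1.226442       NaN  0.996761
# 4  0.424773       NaN  2.122809
# 5  1.083828  0.646571  0.594823
# 6 -0.870858  0.289760 -0.014352
# With thresh=2, for instance, rows 0-2 (one non-NaN value each) are dropped
# and rows 3-6 are kept.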
# Dict-based de-duplication: map a running index to each distinct line.
# Membership is tested against a companion set, so each lookup is O(1) rather
# than a linear scan of dict.values() for every line read. Every distinct line
# is still held in memory, so this only suits data whose distinct lines fit
# in RAM.
# fr is expected to be an open, readable file object defined elsewhere.
# NOTE: the name `dict` shadows the builtin; it is kept unchanged so that any
# later code relying on this name keeps working.
dict = {}
key = 0
seen = set()
for line in fr:  # iterate lazily instead of materialising fr.readlines()
    if line not in seen:
        seen.add(line)
        dict[key] = line
        key += 1
print("the distinct records = " + str(len(dict)))
# Remove blank lines
# Print every line of fr except blank or near-empty ones.
# fr is expected to be an open, readable file object defined elsewhere.
for line in fr:  # iterate lazily instead of materialising fr.readlines()
    # Skip blank lines and lines with very little content: anything of
    # length 2 or less, counting the trailing newline if present.
    if len(line) > 2:
        # The call form with a single argument behaves the same on Python 2
        # and Python 3; the bare `print line` statement is Python 2 only.
        # NOTE: line normally still ends with its own newline, so the output
        # is double-spaced.
        print(line)
# Strip newline characters