# 简单线性回归——梯度下降代码实现
import numpy as np
# pyplot is the submodule that provides scatter/plot/show; the top-level
# ``matplotlib`` package does not expose them.
import matplotlib.pyplot as plt

# 1. Load the data (a ready-made data set, data.csv).
# The file is parsed as comma-separated values into a 2-D array.
points = np.genfromtxt("data.csv", delimiter=',')
N = len(points)

# Extract x and y: first column of every row is x, second column is y.
x = np.array(points[:, 0])
y = np.array(points[:, 1])

# Scatter plot of the raw samples.
plt.scatter(x, y)
# show must be *called*; a bare ``plt.show`` is a no-op expression.
plt.show()
# 2. 定义损失函数
# 定义线性模型 y = mx + b
def compute_cost(points, b, m):
    """Return the mean squared error of the line ``y = m * x + b``.

    Args:
        points: 2-D array-like of samples; column 0 holds the x values and
            column 1 holds the y values.
        b: Intercept of the line.
        m: Slope of the line.

    Returns:
        The average of ``(y - m * x - b) ** 2`` over all rows of ``points``.

    Raises:
        ValueError: If ``points`` contains no rows, since the mean is then
            undefined.
    """
    data = np.asarray(points, dtype=float)
    n = len(data)
    if n == 0:
        raise ValueError("points must contain at least one sample")

    # Residual of every sample against the current line, computed at once.
    residuals = data[:, 1] - m * data[:, 0] - b
    # Sum of squared residuals divided by the sample count -> mean error.
    return np.sum(residuals ** 2) / float(n)
# 3. Hyperparameters of the model:
#    the step size (learning rate), the starting point and the iteration count.
learning_rate = 0.0001
initial_b = 0
initial_m = 0
num_iteration = 10
# 4. 实现核心算法(梯度下降)
def gradient_descent(points, initial_b, initial_m, learning_rate, num_iteration):
    """Fit ``y = m * x + b`` by repeating single gradient-descent steps.

    Args:
        points: Sample data passed to ``compute_cost`` and ``step_grad_desc``.
        initial_b: Starting intercept.
        initial_m: Starting slope.
        learning_rate: Step size forwarded to ``step_grad_desc``.
        num_iteration: Number of update steps to perform.

    Returns:
        A list ``[b, m, cost_list]``. ``cost_list`` holds the cost measured
        *before* each update, so it has ``num_iteration`` entries and does not
        include the cost of the final ``b``/``m``.
    """
    intercept, slope = initial_b, initial_m
    cost_history = []

    for _ in range(num_iteration):
        # Record the cost of the current parameters, then move them one step.
        cost_history.append(compute_cost(points, intercept, slope))
        intercept, slope = step_grad_desc(
            intercept, slope, np.array(points), learning_rate
        )

    return [intercept, slope, cost_history]
# 每一步的梯度下降
def step_grad_desc(current_b, current_m, points, learning_rate):
    """Perform one gradient-descent update of the line ``y = m * x + b``.

    The cost being minimised is the mean squared error
    ``(1/N) * sum((m * x + b - y) ** 2)``, whose partial derivatives are
    ``(2/N) * sum((m * x + b - y) * x)`` for ``m`` and
    ``(2/N) * sum(m * x + b - y)`` for ``b``.

    Args:
        current_b: Current intercept.
        current_m: Current slope.
        points: 2-D array-like of samples; column 0 is x, column 1 is y.
        learning_rate: Step size applied to both gradients.

    Returns:
        A tuple ``(b_update, m_update)`` with the new intercept and slope.

    Raises:
        ValueError: If ``points`` contains no rows.
    """
    data = np.asarray(points, dtype=float)
    n = len(data)
    if n == 0:
        raise ValueError("points must contain at least one sample")

    xs = data[:, 0]
    ys = data[:, 1]

    # Prediction error of every sample; both gradients are built from it.
    # The intercept gradient must use the full residual (prediction minus y),
    # not just the prediction.
    residuals = current_m * xs + current_b - ys
    m_grad = np.sum(residuals * xs)
    b_grad = np.sum(residuals)

    scale = 2 / float(n)
    m_update = current_m - learning_rate * m_grad * scale
    b_update = current_b - learning_rate * b_grad * scale
    return b_update, m_update
# 5. Test: run gradient descent to compute the fitted m and b.
b, m, cost_list = gradient_descent(points, initial_b,initial_m, learning_rate, num_iteration)
print("final m is: ", m)
print("final b is: ", b)
print(cost_list)
# 6. Evaluate the cost function at the final parameters.
print("final cost: ", compute_cost(points, b, m))
# 7. Plot how the cost evolves over the iterations.
plt.plot(cost_list)
plt.show()
# 8. Plot the fitted line on top of the samples.
plt.scatter(x, y)
# Predicted y for every sample x under the fitted line.
y_pred = m * x + b
plt.plot(x, y_pred, c='r')
plt.show()
# K 近邻(KNN)算法(找自己最近的邻居)
- 最简单最初级的分类器,就是将全部的训练数据所对应的类别都记录下来
- 当测试对象的属性和某个训练对象的属性完全匹配时,便可以对其进行分类
- KNN 是一种基本分类方法,通过测量不同特征值之间的距离进行分类
- 如果一个样本在特征空间中的 k 个最相似的样本中的大多数属于某一个类别,则该样本也属于这个类别
- 其中 k 通常是不大于 20 的整数
- KNN 算法中,所选择的邻居都是已经正确分类的对象
- KNN 算法的结果很大程度取决于K的选择
# 添加依赖
import numpy as np
# 数值计算,数值分析库
import pandas as pd
# 里面有实例数据
from sklearn.datasets import load_iris
# 切分数据集(训练集|测试集)
from sklearn.model_selection import train_test_split
# 准确度评分(计算分类数据的预测准确度)
from sklearn.metrics import accuracy_score
if __name__ == '__main__':
    # Load the iris data set and put the feature matrix into a DataFrame,
    # one column per feature name.
    iris = load_iris()
    # print(iris)
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

    # Add a 'class' label column: first the numeric target, then replace the
    # codes 0, 1, 2 with the corresponding target names.
    df['class'] = iris.target
    df['class'] = df['class'].map(
        {code: iris.target_names[code] for code in range(3)}
    )

    # Show the first rows and the summary statistics.
    print(df.head())
    print(df.describe())

    # Feature matrix (2-D) and the targets reshaped into a single column.
    x = iris.data
    y = iris.target.reshape(-1, 1)
    print(x.shape, y.shape)
    # -> (150, 4) (150, 1)

    # Split into training and test sets: 30% test, fixed seed, stratified
    # on the labels.
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )
    print("x_train.shape, y_train.shape")
    print(x_train.shape, y_train.shape)
    print("x_test.shape, y_test.shape")
    print(x_test.shape, y_test.shape)
# 真正的算法实现
# 距离函数
def l1_dis(a, b):
    """Return the L1 (Manhattan) distance between ``b`` and each row of ``a``.

    The absolute differences ``|a - b|`` are summed along axis 1, so ``a``
    needs at least two dimensions; ``b`` is broadcast against it. The result
    has one entry per row of ``a``.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)
def l2_dis(a, b):
    """Return the L2 (Euclidean) distance between ``b`` and each row of ``a``.

    The squared differences are summed along axis 1 and the square root of
    each row sum is returned, giving one distance per row of ``a``.
    """
    diff = a - b
    return np.sqrt(np.square(diff).sum(axis=1))
# 分类器的实现
class kNN(object):
    """k-nearest-neighbours classifier using a pluggable distance function."""

    def __init__(self, k_neighbors=1, disc_func=l1_dis):
        """Store the neighbour count and the distance function.

        Args:
            k_neighbors: How many nearest training samples vote on a label.
            disc_func: Callable ``(train_matrix, sample)`` returning one
                distance per training row.
        """
        self.k_neighbors = k_neighbors
        self.disc_func = disc_func

    def fit(self, x, y):
        """Remember the training features ``x`` and labels ``y``; no other work is done."""
        self.x_train, self.y_train = x, y

    def predict(self, test):
        """Predict a label for every row of ``test``.

        Returns an array of shape ``(test.shape[0], 1)`` with the dtype of the
        stored training labels. Each label is the most frequent one among the
        ``k_neighbors`` closest training samples; ``np.bincount`` is used for
        the vote, so labels are expected to be non-negative integers.
        """
        n_samples = test.shape[0]
        # Start from zeros and fill in one prediction per test row.
        predictions = np.zeros((n_samples, 1), dtype=self.y_train.dtype)

        for row, sample in enumerate(test):
            # Distance from this sample to every training sample.
            dist = self.disc_func(self.x_train, sample)
            # Indices of the k closest training samples.
            nearest = np.argsort(dist)[:self.k_neighbors]
            # Count how often each label occurs among them and pick the
            # most frequent one.
            votes = np.bincount(self.y_train[nearest].ravel())
            predictions[row] = votes.argmax()

        return predictions
# Test: fit a 5-neighbour classifier on the training split and score it on
# the test split.
knn = kNN(k_neighbors=5)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
# print("y_pred = {}".format(y_pred))
# accuracy_score's result is multiplied by 100 and printed as a percentage.
print("分类准确率:{}%".format(accuracy_score(y_test, y_pred) * 100))
|
|