[学习交流] 【广州校区】+【原创】线性回归

简单线性回归——梯度下降代码实现

[Python] 纯文本查看 复制代码

import numpy as np
# pyplot (not the top-level matplotlib package) provides scatter/plot/show.
import matplotlib.pyplot as plt

# 1. Load the data (a ready-made data set stored in data.csv).
points = np.genfromtxt("data.csv", delimiter=',')
N = len(points)

# Split the table into x (first column of every row)
# and y (second column of every row).
x = np.array(points[:, 0])
y = np.array(points[:, 1])

# Draw a scatter plot of the raw data.
plt.scatter(x, y)
# show must be *called*; a bare `plt.show` is a no-op expression.
plt.show()

# 2. Define the loss function
# The linear model is y = m * x + b
def compute_cost(points, b, m):
    """Return the mean squared error of the line ``y = m * x + b``.

    Args:
        points: 2-D array-like; column 0 holds x, column 1 holds y.
        b: Intercept of the line.
        m: Slope of the line.

    Returns:
        The average of ``(y - m * x - b) ** 2`` over all rows, as a float.

    Raises:
        ValueError: If ``points`` has no rows (the mean would be undefined).
    """
    points = np.asarray(points, dtype=float)
    if len(points) == 0:
        raise ValueError("points must contain at least one row")
    # Squared residual of every sample, computed in one vectorised pass.
    residuals = points[:, 1] - m * points[:, 0] - b
    return float(np.mean(residuals ** 2))

# 3. Hyper-parameters of the model:
#    step size (learning rate), starting point and number of iterations.
learning_rate = 1e-4
initial_b = initial_m = 0
num_iteration = 10

# 4. Core algorithm (gradient descent)
def gradient_descent(points, initial_b, initial_m, learning_rate,num_iteration):
    """Fit ``y = m * x + b`` by repeating single gradient-descent steps.

    Args:
        points: 2-D array-like; column 0 holds x, column 1 holds y.
        initial_b: Starting intercept.
        initial_m: Starting slope.
        learning_rate: Step size passed to every update step.
        num_iteration: Number of update steps to perform.

    Returns:
        A list ``[b, m, cost_list]``. ``cost_list`` holds the cost measured
        *before* each update, so it has ``num_iteration`` entries and does
        not include the cost of the final ``b`` and ``m``.
    """
    # Convert once up front; the data does not change between iterations.
    data = np.asarray(points)
    b = initial_b
    m = initial_m

    # Loss value recorded at the start of every iteration.
    cost_list = []

    for _ in range(num_iteration):
        cost_list.append(compute_cost(data, b, m))
        b, m = step_grad_desc(b, m, data, learning_rate)
    return [b, m, cost_list]

# One gradient-descent step
def step_grad_desc(current_b, current_m, points, learning_rate):
    """Perform one gradient-descent update of the line ``y = m * x + b``.

    The loss is the mean squared error, whose partial derivatives are
    ``dL/dm = (2 / N) * sum((m * x + b - y) * x)`` and
    ``dL/db = (2 / N) * sum(m * x + b - y)``.

    Args:
        current_b: Current intercept.
        current_m: Current slope.
        points: 2-D array-like; column 0 holds x, column 1 holds y.
        learning_rate: Step size applied to both gradients.

    Returns:
        A tuple ``(b_update, m_update)`` with the new intercept and slope.

    Raises:
        ValueError: If ``points`` has no rows.
    """
    points = np.asarray(points, dtype=float)
    n = len(points)
    if n == 0:
        raise ValueError("points must contain at least one row")

    xs = points[:, 0]
    ys = points[:, 1]
    # Prediction error of every sample. Both gradients are built from it;
    # the intercept gradient must include the "- y" term as well.
    errors = current_m * xs + current_b - ys
    m_grad = (2.0 / n) * np.sum(errors * xs)
    b_grad = (2.0 / n) * np.sum(errors)

    m_update = current_m - learning_rate * m_grad
    b_update = current_b - learning_rate * b_grad
    return b_update, m_update

# 5. Test: run gradient descent to obtain the fitted m and b
b, m, cost_list = gradient_descent(points, initial_b,initial_m, learning_rate, num_iteration)
print("final m is: ", m)
print("final b is: ", b)
print(cost_list)

# 6. Evaluate the loss function at the returned parameters
print("final cost: ", compute_cost(points, b, m))

# 7. Plot how the loss changes over the iterations
# NOTE(review): plot/scatter/show live in matplotlib.pyplot, so `plt` must be
# bound to that module for the calls below to work -- confirm the import.
plt.plot(cost_list)
plt.show()

# 8. Plot the fitted line on top of the data points
plt.scatter(x, y)
y_pred = m * x + b
plt.plot(x, y_pred, c='r')
plt.show()

K 近邻(KNN)算法(找自己最近的邻居)

最简单最初级的分类器，就是将全部的训练数据所对应的类别都记录下来
- 当测试对象的属性和某个训练对象的属性完全匹配时，便可以对其进行分类
KNN 是一种基本分类方法，通过测量不同特征值之间的距离进行分类
- 如果一个样本在特征空间中的 k 个最相似的样本中的大多数属于某一个类别，则该样本也属于这个类别
- 其中 k 通常是不大于 20 的整数
KNN 算法中，所选择的邻居都是已经正确分类的对象
KNN 算法的结果很大程度取决于K的选择
- K 一般选择奇数

[Python] 纯文本查看 复制代码

# 添加依赖
import numpy as np
# 数值计算，数值分析库
import pandas as pd
# 里面有实例数据
from sklearn.datasets import load_iris
# 切分数据集(训练集|测试集)
from sklearn.model_selection import train_test_split
# 准确度评分（计算分类数据的预测准确度）
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    # Load the iris data set and put the feature matrix into a DataFrame.
    iris = load_iris()
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    # Add a 'class' column: first the numeric target, then replace each of
    # the three target indices with the corresponding species name.
    df['class'] = iris.target
    df['class'] = df['class'].map(
        {idx: iris.target_names[idx] for idx in range(3)}
    )
    # head() shows the first five rows by default.
    print(df.head())
    # Summary statistics of the numeric columns.
    print(df.describe())
    # Feature matrix (2-D array).
    x = iris.data
    # Labels reshaped into a single column so they form a 2-D array too.
    y = iris.target.reshape((-1, 1))
    print(x.shape, y.shape)
    # (150, 4) (150, 1)

    # Hold out 30% of the samples for testing; stratify keeps the class
    # proportions and the fixed seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )
    print("x_train.shape, y_train.shape")
    print(x_train.shape, y_train.shape)
    print("x_test.shape, y_test.shape")
    print(x_test.shape, y_test.shape)


    # The actual algorithm implementation
    # Distance functions
    def l1_dis(a, b):
        """Return the L1 (Manhattan) distance between the rows of a and b.

        The absolute differences are summed along axis 1, so ``a - b`` must
        be at least 2-D. In ``kNN.predict`` ``a`` is the training matrix and
        ``b`` a single test sample that broadcasts against every row.
        """
        abs_diff = np.abs(a - b)
        return abs_diff.sum(axis=1)


    def l2_dis(a, b):
        """Return the L2 (Euclidean) distance between the rows of a and b.

        The squared differences are summed along axis 1, so ``a - b`` must
        be at least 2-D; ``b`` may be one sample that broadcasts over ``a``.
        """
        diff = a - b
        squared_sum = np.sum(diff * diff, axis=1)
        return np.sqrt(squared_sum)


    # Classifier implementation
    class kNN(object):
        """k-nearest-neighbour classifier with a pluggable distance function."""

        def __init__(self, k_neighbors=1, disc_func=l1_dis):
            """Store the neighbour count and the distance function to use."""
            self.disc_func = disc_func
            self.k_neighbors = k_neighbors

        def fit(self, x, y):
            """Remember the training samples and labels; nothing is computed."""
            self.x_train, self.y_train = x, y

        def predict(self, test):
            """Predict a label for every row of ``test``.

            Returns an array of shape ``(len(test), 1)`` with the dtype of
            the stored training labels. Each label is the most frequent one
            among the ``k_neighbors`` closest training samples; ``argmax``
            resolves ties in favour of the smallest label value.
            """
            n_samples = test.shape[0]
            k = self.k_neighbors
            # Start from zeros and fill one prediction per test sample.
            labels = np.zeros((n_samples, 1), dtype=self.y_train.dtype)
            for row, sample in enumerate(test):
                # Distance from this sample to every training sample.
                dists = self.disc_func(self.x_train, sample)
                # Training indices ordered from nearest to farthest.
                order = np.argsort(dists)
                # Labels of the k nearest samples, flattened for bincount.
                votes = self.y_train[order[:k]].ravel()
                labels[row] = np.argmax(np.bincount(votes))
            return labels


    # Evaluation: fit a 5-neighbour classifier on the training split,
    # predict the held-out split and print the accuracy as a percentage.
    knn = kNN(k_neighbors=5)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    print(f"分类准确率：{accuracy_score(y_test, y_pred) * 100}%")

帐号		自动登录	找回密码
密码			加入黑马

[学习交流] 【广州校区】+【原创】线性回归

0 个回复