萱仔 Job-Hunting Series — 1.2 Machine Learning Fundamentals Review + Code Practice
Hands-On Section:
1. Analyze user requirements
2. Data mining (gather the raw data)
3. Data preprocessing
4. Feature engineering ——>
① Vectorization (data must be converted into vectors before a machine can learn from it)
② Feature selection (WOE, IV)
③ Categorical features: mapped/label encoding, binary encoding, one-hot encoding (which can greatly inflate the dimensionality)
④ Text features: TF-IDF
⑤ Image features: pixel values
⑥ Derived features: the data that reaches the model is no longer the original raw features
(see the encoding / TF-IDF / WOE sketch right after this list)
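To make the encoding and text items above concrete, here is a minimal scikit-learn sketch covering one-hot encoding, TF-IDF, and a simple WOE/IV computation. The toy data and the `woe_iv` helper are illustrative assumptions, not part of the original notes; `woe_iv` also assumes every bin contains samples of both classes.

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# One-hot encoding of a categorical feature (dimensionality grows with category count)
colors = np.array([["red"], ["green"], ["blue"], ["green"]])
print(OneHotEncoder().fit_transform(colors).toarray())

# TF-IDF vectorization of raw text
docs = ["machine learning is fun", "learning python is fun too"]
vectorizer = TfidfVectorizer()
print(vectorizer.fit_transform(docs).toarray())
print(vectorizer.get_feature_names_out())

# WOE/IV for feature selection on a binary target (0 = good, 1 = bad);
# assumes every bin contains samples of both classes
def woe_iv(bins, y):
    iv = 0.0
    n_good, n_bad = (y == 0).sum(), (y == 1).sum()
    for b in np.unique(bins):
        p_good = ((bins == b) & (y == 0)).sum() / n_good
        p_bad = ((bins == b) & (y == 1)).sum() / n_bad
        iv += (p_good - p_bad) * np.log(p_good / p_bad)
    return iv

bins = np.array(["low", "low", "high", "high", "low", "high"])
y = np.array([0, 0, 0, 1, 1, 1])
print(woe_iv(bins, y))
```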
5. Choose a model ——> tune its parameters (a GridSearchCV sketch follows the model examples below)
① Linear regression: fit a straight line by least squares (closed form) or gradient descent; the loss function is the residual sum of squares.
Multicollinearity: check whether the chosen features are strongly correlated with one another.
Diagnostics: the correlation matrix (pairwise correlation coefficients) and the VIF (variance inflation factor) test; a quick sketch follows.
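A minimal sketch of both checks. The toy data, the `vif` helper, and the VIF > 10 rule of thumb are illustrative assumptions, not from the original notes.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

def vif(X):
    # VIF of column i is 1 / (1 - R^2) from regressing it on the other columns
    out = []
    for i in range(X.shape[1]):
        others = np.delete(X, i, axis=1)
        r2 = LinearRegression().fit(others, X[:, i]).score(others, X[:, i])
        out.append(1.0 / (1.0 - r2))
    return np.array(out)

# Toy data: x2 is almost a copy of x0, so x0 and x2 should show large VIFs
rng = np.random.default_rng(0)
x0, x1 = rng.normal(size=100), rng.normal(size=100)
x2 = x0 + 0.01 * rng.normal(size=100)
X = np.column_stack([x0, x1, x2])

print(np.corrcoef(X, rowvar=False))  # correlation matrix
print(vif(X))                        # rule of thumb: VIF > 10 flags multicollinearity
```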
```python
import numpy as np
import matplotlib.pyplot as plt

# Generate sample data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Linear regression via the normal equation (ordinary least squares)
class LinearRegression:
    def fit(self, X, y):
        X_b = np.c_[np.ones((len(X), 1)), X]  # prepend the bias column
        self.theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

    def predict(self, X):
        X_b = np.c_[np.ones((len(X), 1)), X]
        return X_b.dot(self.theta)

# Create and train the model
model = LinearRegression()
model.fit(X, y)

# Predict
X_new = np.array([[0], [2]])
y_predict = model.predict(X_new)

# Visualize
plt.plot(X_new, y_predict, "r-")
plt.plot(X, y, "b.")
plt.xlabel("$x_1$")
plt.ylabel("$y$")
plt.show()
```
② Logistic regression: linear regression + sigmoid. It predicts a probability rather than the target value itself; the parameters are fit by maximizing the log-likelihood (equivalently, minimizing the cross-entropy loss), typically with gradient descent.
```python
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load sample data
iris = load_iris()
X = iris["data"][:, 3:]  # petal width
y = (iris["target"] == 2).astype(int)  # 1 if Iris virginica

# Logistic regression trained with batch gradient descent on the cross-entropy loss
class LogisticRegression:
    def __init__(self, learning_rate=0.1, n_iter=1000):
        self.learning_rate = learning_rate
        self.n_iter = n_iter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        m, n = X.shape
        self.theta = np.random.randn(n + 1)
        X_b = np.c_[np.ones((m, 1)), X]
        for _ in range(self.n_iter):
            gradients = X_b.T.dot(self.sigmoid(X_b.dot(self.theta)) - y) / m
            self.theta -= self.learning_rate * gradients

    def predict_proba(self, X):
        X_b = np.c_[np.ones((len(X), 1)), X]
        return self.sigmoid(X_b.dot(self.theta))

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)

# Create and train the model
model = LogisticRegression()
model.fit(X, y)

# Predict
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = model.predict_proba(X_new)

# Visualize
plt.plot(X_new, y_proba, "g-", label="Virginica probability")
plt.plot(X[y == 0], y[y == 0], "bs", label="Not Virginica")
plt.plot(X[y == 1], y[y == 1], "g^", label="Virginica")
plt.xlabel("Petal width (cm)")
plt.ylabel("Probability")
plt.legend()
plt.show()
```
③ Decision trees (the foundation of ensemble learning): nodes, branches, leaf labels (splitting criteria, pruning)
Splitting criteria:
Gini impurity: grow the tree toward splits that reduce Gini impurity.
Information entropy: grow the tree toward splits that maximize information gain.
(a small sketch of both criteria follows; the tree example comes after it)
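To make both criteria concrete, here is a minimal sketch; the toy labels and split mask are made up for illustration, and the helpers assume non-empty children.

```python
import numpy as np

def gini(y):
    # Gini impurity: 1 - sum of squared class proportions
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def entropy(y):
    # Shannon entropy: -sum(p * log2(p))
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(y, mask):
    # Parent entropy minus the size-weighted entropy of the two children
    n = len(y)
    left, right = y[mask], y[~mask]
    return entropy(y) - len(left) / n * entropy(left) - len(right) / n * entropy(right)

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
mask = np.array([True, True, True, False, False, False, False, False])
print(gini(y))                     # 0.5 for a 50/50 class mix
print(entropy(y))                  # 1.0 bit for a 50/50 class mix
print(information_gain(y, mask))   # gain from splitting on this mask
```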
```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Load sample data
iris = load_iris()
X = iris.data[:, 2:]  # petal length and width
y = iris.target

# Thin wrapper around scikit-learn's decision tree
class DecisionTree:
    def fit(self, X, y):
        self.tree = DecisionTreeClassifier(max_depth=2)
        self.tree.fit(X, y)

    def predict(self, X):
        return self.tree.predict(X)

# Create and train the model
model = DecisionTree()
model.fit(X, y)

# Visualize the fitted tree
plt.figure(figsize=(12, 8))
plot_tree(model.tree, filled=True, feature_names=iris.feature_names[2:],
          class_names=iris.target_names)
plt.show()
```
④ Naive Bayes classifier (the Gaussian variant below: features are assumed conditionally independent given the class, each modeled by a per-class Gaussian)
```python
import numpy as np
from sklearn.datasets import load_iris

# Gaussian Naive Bayes implemented from scratch
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = np.zeros((len(self.classes), X.shape[1]))
        self.var = np.zeros((len(self.classes), X.shape[1]))
        self.priors = np.zeros(len(self.classes))
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0)
            self.priors[idx] = X_c.shape[0] / X.shape[0]

    def _gaussian_density(self, class_idx, x):
        # Per-feature Gaussian likelihood for one class
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-(x - mean) ** 2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def _predict_single(self, x):
        # Log-posterior per class: log prior + sum of per-feature log likelihoods
        posteriors = []
        for idx, c in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            class_conditional = np.sum(np.log(self._gaussian_density(idx, x)))
            posteriors.append(prior + class_conditional)
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        return np.array([self._predict_single(x) for x in X])

# Load sample data
iris = load_iris()
X, y = iris.data, iris.target

# Train, then evaluate on the training set
model = NaiveBayes()
model.fit(X, y)
y_pred = model.predict(X)

accuracy = np.mean(y_pred == y)
print(f'Accuracy: {accuracy * 100:.2f}%')
```
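For the parameter tuning mentioned in step 5, here is a minimal GridSearchCV sketch on a decision tree; the parameter grid is an arbitrary illustration, not a recommended setting.

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Exhaustive search over a small grid with 5-fold cross-validation
param_grid = {"max_depth": [2, 3, 4, 5], "min_samples_split": [2, 5, 10]}
search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5)
search.fit(iris.data, iris.target)

print(search.best_params_)   # best parameter combination found
print(search.best_score_)    # its mean cross-validated accuracy
```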
6. Visualize the results (one concrete option is the confusion-matrix sketch below)
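As one concrete example, a minimal confusion-matrix plot that reuses `iris`, `y`, and `y_pred` from the Naive Bayes block above; the plotting details are an illustrative sketch.

```python
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes
cm = confusion_matrix(y, y_pred)

plt.imshow(cm, cmap="Blues")
plt.colorbar()
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j], ha="center", va="center")
plt.xticks(range(cm.shape[1]), iris.target_names)
plt.yticks(range(cm.shape[0]), iris.target_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix")
plt.show()
```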