```python
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('./ML/deeplearning.mplstyle')

x_train = np.array([1.0, 2.0])
y_train = np.array([300.0, 500.0])
m = x_train.shape[0]   # or m = len(x_train)
# print(m)  # 2

# Plot the data points
plt.scatter(x_train, y_train, c='r', marker='x', label='Actual Values')
# Set the title
plt.title("Housing Prices")
# Set the y-axis label
plt.ylabel('Price (in 1000s of dollars)')
# Set the x-axis label
plt.xlabel('Size (1000 sqft)')
# Show the plot
plt.show()

# Pick one set of parameters and adjust them until the output fits the data
w = 200
b = 100

# Model output
def compute_model_output(x, w, b):
    """
    Computes the prediction of a linear model
    Args:
      x (ndarray (m,)): Data, m examples
      w,b (scalar)    : model parameters
    Returns
      f_wb (ndarray (m,)): model predictions
    """
    m = x.shape[0]
    f_wb = np.zeros_like(x)
    for i in range(m):
        f_wb[i] = w * x[i] + b
    return f_wb

# Plot the model output
tmp_f_wb = compute_model_output(x_train, w, b)

# Plot our model prediction
plt.plot(x_train, tmp_f_wb, c='b', label='Our Prediction')

# Plot the data points
# plt.scatter(x_train, y_train, marker='x', c='r', label='Actual Values')
# # Set the title
# plt.title("Housing Prices")
# # Set the y-axis label
# plt.ylabel('Price (in 1000s of dollars)')
# # Set the x-axis label
# plt.xlabel('Size (1000 sqft)')
# plt.legend()
# plt.show()

# Make a prediction and show it on the plot
x_i = 1.2
cost_1200sqft = w * x_i + b
print(f"${cost_1200sqft:.0f} thousand dollars")
plt.scatter(x_i, cost_1200sqft, marker='x', c='green', s=80, label='Prediction Values')
plt.legend()
plt.show()
```
```python
import numpy as np
import matplotlib.pyplot as plt
from lab_utils_uni import plt_intuition, plt_stationary, plt_update_onclick, soup_bowl
plt.style.use('./deeplearning.mplstyle')

x_train = np.array([1, 2])
y_train = np.array([300, 500])

def compute_cost(x, y, w, b):
    """
    Computes the cost function for linear regression.

    Args:
      x (ndarray (m,)): Data, m examples
      y (ndarray (m,)): target values
      w,b (scalar)    : model parameters

    Returns
        total_cost (float): The cost of using w,b as the parameters for linear regression
               to fit the data points in x and y
    """
    m = len(x)
    cost_sum = 0  # must be initialized before accumulating
    for i in range(m):
        cost = x[i] * w + b - y[i]
        cost_sum += cost ** 2
    cost_total = (1 / (2 * m)) * cost_sum
    return cost_total

plt_intuition(x_train, y_train)

# More data
x_train = np.array([1.0, 1.7, 2.0, 2.5, 3.0, 3.2])
y_train = np.array([250, 300, 480, 430, 630, 730])
plt.close('all')
fig, ax, dyn_items = plt_stationary(x_train, y_train)
updater = plt_update_onclick(fig, ax, x_train, y_train, dyn_items)
soup_bowl()
```
Gradient Descent for Linear Regression
C1_W1_Lab05_Gradient_Descent_Soln. The model prediction $f_{w,b}(x^{(i)})$ is:

$$f_{w,b}(x^{(i)}) = wx^{(i)} + b \tag{1}$$

$$J(w,b) = \frac{1}{2m} \sum_{i=0}^{m-1} \left(f_{w,b}(x^{(i)}) - y^{(i)}\right)^2 \tag{2}$$
\begin{align*} \text{repeat}&\text{ until convergence:} \; \lbrace \newline \; w &= w - \alpha \frac{\partial J(w,b)}{\partial w} \tag{3} \; \newline b &= b - \alpha \frac{\partial J(w,b)}{\partial b} \newline \rbrace \end{align*}
import math, copy import numpy as np import matplotlib.pyplot as plt plt.style.use('./deeplearning.mplstyle') from lab_utils_uni import plt_house_x, plt_contour_wgrad, plt_divergence, plt_gradients # Load our data set x_train = np.array([1.0, 2.0]) #features y_train = np.array([300.0, 500.0]) #target value #Function to calculate the cost defcompute_cost(x, y, w, b): m = x.shape[0] cost = 0 for i inrange(m): f_wb = w * x[i] + b cost = cost + (f_wb - y[i])**2 total_cost = 1 / (2 * m) * cost
return total_cost defcompute_gradient(x, y, w, b): """ Computes the gradient for linear regression Args: x (ndarray (m,)): Data, m examples y (ndarray (m,)): target values w,b (scalar) : model parameters Returns dj_dw (scalar): The gradient of the cost w.r.t. the parameters w dj_db (scalar): The gradient of the cost w.r.t. the parameter b """ m=x.shape[0] dj_dw=dj_db=0 for i inrange(m): dj_dw_temp=(1/m)*(w*x[i]+b-y[i])*x[i] dj_db_temp=(1/m)*(w*x[i]+b-y[i]) dj_dw+=dj_dw_temp dj_db+=dj_db_temp return dj_dw,dj_db plt_gradients(x_train,y_train, compute_cost, compute_gradient) plt.show() defgradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function): """ Performs batch gradient descent to fit w,b. Updates w,b by taking num_iters gradient steps with learning rate alpha Args: x (ndarray (m,)) : Data, m examples y (ndarray (m,)) : target values w_in,b_in (scalar): initial values of model parameters alpha (float): Learning rate num_iters (int): number of iterations to run gradient descent cost_function: function to call to produce cost gradient_function: function to call to produce gradient Returns: w (scalar): Updated value of parameter after running gradient descent b (scalar): Updated value of parameter after running gradient descent J_history (List): History of cost values p_history (list): History of parameters [w,b] """ w=w_in b=b_in J_history=[] p_history=[] for i inrange(num_iters): dj_dw,dj_db=gradient_function(x,y,w,b) w=w-alpha*dj_dw b=b-alpha*dj_db if i<=100000: J_history.append(cost_function(x,y,w,b)) p_history.append([w,b]) # Print cost every at intervals 10 times or as many iterations if < 10 if i% math.ceil(num_iters/10) == 0: print(print(f"Iteration {i:4}: Cost {J_history[-1]:0.2e} ", f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ", f"w: {w: 0.3e}, b:{b: 0.5e}")) return w,b,J_history,p_history plt.close() w_i=0 b_i=0 internal=5000 alpha=0.02 w_f,b_f,J_h,p_h=gradient_descent(x_train,y_train,w_i,b_i,alpha,internal,compute_cost,compute_gradient) print(f'迭代完成后最终w={w_f},b={b_f}') # plot cost versus iteration fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=(12,4)) ax1.plot(J_h) ax2.plot(1000 + np.arange(len(J_h[1000:])), J_h[1000:]) #第1000次开始的成本 a=1000 + np.arange(len(J_h[1000:])) b=J_h[1000:] a1=len(a); b1=len(b) print(f'第二张图的横坐标为{a},横纵坐标数组长度分别为{a1},{b1}') #打印第二张图的横坐标 ax1.set_title("Cost vs. iteration"); ax2.set_title("Cost vs. iteration (tail)") ax1.set_ylabel('Cost') ; ax2.set_ylabel('Cost') ax1.set_xlabel('iteration step') ; ax2.set_xlabel('iteration step') plt.show() # 使用迭代后的w,b预测 print(f"1000 sqft house prediction {w_f*1.0 + b_f:0.1f} Thousand dollars") print(f"1200 sqft house prediction {w_f*1.2 + b_f:0.1f} Thousand dollars") print(f"2000 sqft house prediction {w_f*2.0 + b_f:0.1f} Thousand dollars")
import copy, math import numpy as np import matplotlib.pyplot as plt plt.style.use('./deeplearning.mplstyle') np.set_printoptions(precision=2) # reduced display precision on numpy arrays X_train = np.array([[2104, 5, 1, 45], [1416, 3, 2, 40], [852, 2, 1, 35]]) y_train = np.array([460, 232, 178]) b_init = 785.1811367994083 w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618]) print(X_train[0]) defpredict_single_loop(x, w, b): """ single predict using linear regression Args: x (ndarray): Shape (n,) example with multiple features w (ndarray): Shape (n,) model parameters b (scalar): model parameter Returns: p (scalar): prediction """ f_wb=np.dot(x,w)+b return f_wb x_0=X_train[0,:] Yp0=predict_single_loop(x_0,w_init,b_init) print(f'初始w,b下的多元一阶线性预测值:{Yp0}') defcompute_cost(X, y, w, b): """ compute cost Args: X (ndarray (m,n)): Data, m examples with n features y (ndarray (m,)) : target values w (ndarray (n,)) : model parameters b (scalar) : model parameter Returns: cost (scalar): cost """ m=len(X) f_wbi=0 cost1=0 for i inrange(m): f_wbi=predict_single_loop(X[i],w,b) cost0=(1/(2*m))*(f_wbi-y[i])**2 cost1=cost0+cost1 return cost1 cost_init=compute_cost(X_train,y_train,w_init,b_init) print(f'初始w,b下的多元一阶线性预测的成本为:{cost_init}') defcompute_gradient(X, y, w, b): """ Computes the gradient for linear regression Args: X (ndarray (m,n)): Data, m examples with n features y (ndarray (m,)) : target values w (ndarray (n,)) : model parameters b (scalar) : model parameter Returns: dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. dj_db (scalar): The gradient of the cost w.r.t. the parameter b. """ m,n = X.shape #(number of examples, number of features) dj_dw=np.zeros((n,)) dj_db=0 for i inrange(m): f_wbi=predict_single_loop(X[i],w,b) err=(1/m)*(f_wbi-y[i]) for j inrange(n): dj_dw[j]=err*X[i,j]+dj_dw[j] dj_db=err+dj_db return dj_dw,dj_db tmp_dj_dw, tmp_dj_db = compute_gradient(X_train, y_train, w_init, b_init) print(f'dj_db at initial w,b: {tmp_dj_db}') print(f'dj_dw at initial w,b: \n {tmp_dj_dw}') defgradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): """ Performs batch gradient descent to learn theta. Updates theta by taking num_iters gradient steps with learning rate alpha Args: X (ndarray (m,n)) : Data, m examples with n features y (ndarray (m,)) : target values w_in (ndarray (n,)) : initial model parameters b_in (scalar) : initial model parameter cost_function : function to compute cost gradient_function : function to compute the gradient alpha (float) : Learning rate num_iters (int) : number of iterations to run gradient descent Returns: w (ndarray (n,)) : Updated values of parameters b (scalar) : Updated value of parameter """ w=copy.deepcopy(w_in);b=b_in J_h=[] for i inrange(num_iters): dj_dw, dj_db = gradient_function(X, y, w, b) w=w-alpha*dj_dw b=b-alpha*dj_db # Save cost J at each iteration if i<100000: # prevent resource exhaustion J_h.append( cost_function(X, y, w, b))
# Print cost every at intervals 10 times or as many iterations if < 10 if i% math.ceil(num_iters / 10) == 0: print(f"Iteration {i:4d}: Cost {J_h[-1]:8.2f} ") return w, b, J_h #return final w,b and J history for graphing # initialize parameters initial_w = np.zeros_like(w_init) initial_b = 0. # some gradient descent settings iterations = 9000 alpha = 5.0e-7 # run gradient descent w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations) print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ") m,_ = X_train.shape # print(X_train[i]) for i inrange(m): print(f"prediction: {np.dot(X_train[i], w_final) + b_final:0.2f}, target value: {y_train[i]}") # plot cost versus iteration fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=(12, 4)) ax1.plot(J_hist) ax2.plot(100 + np.arange(len(J_hist[100:])), J_hist[100:]) ax1.set_title("Cost vs. iteration"); ax2.set_title("Cost vs. iteration (tail)") ax1.set_ylabel('Cost') ; ax2.set_ylabel('Cost') ax1.set_xlabel('iteration step') ; ax2.set_xlabel('iteration step') plt.show()
#This version saves more values and is more verbose than the assigment versons defgradient_descent_houses(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): """ Performs batch gradient descent to learn theta. Updates theta by taking num_iters gradient steps with learning rate alpha Args: X : (array_like Shape (m,n) matrix of examples y : (array_like Shape (m,)) target value of each example w_in : (array_like Shape (n,)) Initial values of parameters of the model b_in : (scalar) Initial value of parameter of the model cost_function: function to compute cost gradient_function: function to compute the gradient alpha : (float) Learning rate num_iters : (int) number of iterations to run gradient descent Returns w : (array_like Shape (n,)) Updated values of parameters of the model after running gradient descent b : (scalar) Updated value of parameter of the model after running gradient descent """ # number of training examples m = len(X) # An array to store values at each iteration primarily for graphing later hist={} hist["cost"] = []; hist["params"] = []; hist["grads"]=[]; hist["iter"]=[]; w = copy.deepcopy(w_in) #avoid modifying global w within function b = b_in save_interval = np.ceil(num_iters/10000) # prevent resource exhaustion for long runs
# Calculate the gradient and update the parameters dj_db,dj_dw = gradient_function(X, y, w, b)
# Update Parameters using w, b, alpha and gradient w = w - alpha * dj_dw b = b - alpha * dj_db # Save cost J,w,b at each save interval for graphing if i == 0or i % save_interval == 0: hist["cost"].append(cost_function(X, y, w, b)) hist["params"].append([w,b]) hist["grads"].append([dj_dw,dj_db]) hist["iter"].append(i)
# Print cost every at intervals 10 times or as many iterations if < 10 if i% math.ceil(num_iters/10) == 0: #print(f"Iteration {i:4d}: Cost {cost_function(X, y, w, b):8.2f} ") cst = cost_function(X, y, w, b) print(f"{i:9d}{cst:0.5e}{w[0]: 0.1e}{w[1]: 0.1e}{w[2]: 0.1e}{w[3]: 0.1e}{b: 0.1e}{dj_dw[0]: 0.1e}{dj_dw[1]: 0.1e}{dj_dw[2]: 0.1e}{dj_dw[3]: 0.1e}{dj_db: 0.1e}") return w, b, hist #return w,b and history for graphing
ax[2].scatter(X_norm[:,0], X_norm[:,3]) ax[2].set_xlabel(X_features[0]); ax[0].set_ylabel(X_features[3]); ax[2].set_title(r"Z-score normalized") ax[2].axis('equal') plt.tight_layout(rect=[0, 0.03, 1, 0.95]) fig.suptitle("distribution of features before, during, after normalization") plt.show() """ 上图显示了两个训练集参数“age”和“sqft”之间的关系。这些以相同的比例绘制。 左:非规范化:“size(sqft)”特征的值范围或方差远大于年龄范围 中间:第一步查找会移除每个要素的平均值或平均值。这将留下以零为中心的要素。很难看出“年龄”功能的差异,但“size(sqft)”显然在零附近。 右:第二步除以方差。这使得两个要素都以零为中心,比例相似 """ # normalize the original features # X_norm, X_mu, X_sigma = zscore_normalize_features(X_train) print(f"X_mu = {mu}, \nX_sigma = {sigma}") print(f"Peak to Peak range by column in Raw X:{np.ptp(X_train,axis=0)}") print(f"Peak to Peak range by column in Normalized X:{np.ptp(X_norm,axis=0)}")
fig,ax=plt.subplots(1, 4, figsize=(12, 3)) for i inrange(len(ax)): norm_plot(ax[i],X_train[:,i],) ax[i].set_xlabel(X_features[i]) ax[0].set_ylabel("count"); fig.suptitle("distribution of features before normalization") plt.show() fig,ax=plt.subplots(1,4,figsize=(12,3)) for i inrange(len(ax)): norm_plot(ax[i],X_norm[:,i],) ax[i].set_xlabel(X_features[i]) ax[0].set_ylabel("count"); fig.suptitle(f"distribution of features after normalization") plt.show()
w_norm, b_norm, hist = run_gradient_descent(X_norm, y_train, 1000, 1.0e-1, ) #predict target using normalized features m=X_norm.shape[0];yp=np.zeros(m) for i inrange(m): yp[i]=np.dot(X_norm[i],w_norm)+b_norm # plot predictions and targets versus original features fig,ax=plt.subplots(1,4,figsize=(12,3),sharey=True) for i inrange(len(ax)): ax[i].scatter(X_norm[:,i],y_train,label='target') ax[i].scatter(X_norm[:,i],yp,label='predict',c=dlorange) ax[i].set_xlabel(X_features[i]) ax[0].set_ylabel("Price"); ax[0].legend(); fig.suptitle("target versus prediction using z-score normalized model") plt.show() # First, normalize out example. x_house = np.array([1200, 3, 1, 40]) x_house_norm,_,_=zscore_normalize_features(x_house) x_house_predict = np.dot(x_house_norm, w_norm) + b_norm print(f'该房屋的预测价格为{x_house_predict}') plt_equal_scale(X_train, X_norm, y_train)
Feature Engineering and Polynomial Regression
Based on C1_W2_Lab04_FeatEng_PolyReg_Soln. This lab applies when the features/data are a non-linear combination.

What does the "@" symbol do in Python? In short, it is used for decorator syntax and for matrix multiplication: `a @ b` is equivalent to `dot(a, b)`.

Common uses of NumPy's `reshape`: `reshape(1, -1)` turns an array into one row; `reshape(2, -1)` turns it into two rows; `reshape(-1, 1)` turns it into one column; `reshape(-1, 2)` turns it into two columns.

In scientific notation a number is written as a real number between 1 and 10 (the mantissa) times a power of 10; to keep the representation unique, the mantissa excludes 10 itself: 782300 = 7.823×10^5, 0.00012 = 1.2×10^(−4), 10000 = 1×10^4. On a computer or calculator the power of 10 is usually written with E or e (for "exponential"): 1e-6 means 1×10^(−6), so 7.823E5 = 782300 and 1.2e−4 = 0.00012. Writing out every digit of a very large or very small number makes its magnitude hard to read and wastes space; scientific notation makes the order of magnitude, precision, and value explicit.

Reading the gradient descent code here, X must be a 2-D array while y is 1-D, so the data needs to be reshaped accordingly: `X : (array_like Shape (m,n)) matrix of examples`, `y : (array_like Shape (m,)) target value of each example`. Shape (m,n) is 2-D and Shape (m,) is 1-D.

As expected, a straight line is not a great fit; something like a quadratic curve, or polynomial features, is needed. To get this, you can modify the input data to engineer the required features. If you swap the original values for their squares, the model can fit the quadratic target. Let's try it: substituting X = X**2 is an equivalent substitution that reduces the problem back to a one-dimensional linear fit, and the resulting curve almost coincides with the data.

`np.c_` is used to concatenate arrays as columns of one matrix.

Running `model_w, model_b = run_gradient_descent_feng(X, y1, iterations=9000, alpha=1e-5)` raises `RuntimeWarning: overflow encountered in scalar add cost = cost + (f_wb_i - y[i])**2`. The cause is an unsuitable alpha that makes the computed values overflow; for high-order features, do not forget z-score normalization to prevent overflow, especially when fitting non-linear curves.

In `w_z, b_z = run_gradient_descent_feng(X1, y1, iterations=100000, alpha=1e-7)`, alpha=1e-7 is so small that even tens of thousands of iterations barely reduce the cost; switching to alpha=1e-1 (on normalized features) fits the curve, which shows how important the choice of alpha is.

Full code:
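As a minimal sketch of the x²-feature idea described above (illustrative only, not the lab's solution code; it builds polynomial features with `np.c_` and normalizes them by hand instead of calling the lab's `run_gradient_descent_feng` helper):

```python
import numpy as np

# simple quadratic target
x = np.arange(0, 20, 1)
y = x**2

# engineer polynomial features: x, x^2, x^3 as columns
X = np.c_[x, x**2, x**3]

# z-score normalize each column so a single learning rate works for all features
X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# plain batch gradient descent on the normalized features
w = np.zeros(X_norm.shape[1]); b = 0.0
alpha, m = 1e-1, X_norm.shape[0]
for _ in range(10000):
    err = X_norm @ w + b - y            # (m,) residuals
    w -= alpha * (X_norm.T @ err) / m   # gradient w.r.t. w
    b -= alpha * np.sum(err) / m        # gradient w.r.t. b

print(w, b)   # the x^2 column typically ends up carrying most of the weight
```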
import numpy as np # %matplotlib widget import matplotlib.pyplot as plt plt.style.use('./deeplearning.mplstyle') plt.rcParams['font.size'] = 8 from lab_utils_common import plot_data, sigmoid, dlc import numpy as np X = np.array([[0.5, 1.5], [1,1], [1.5, 0.5], [3, 0.5], [2, 2], [1, 2.5]]) y = np.array([0, 0, 0, 1, 1, 1]) fig,ax = plt.subplots(1,1,figsize=(4,4)) plot_data(X, y, ax)
# Set both axes to be from 0-4 ax.axis([0, 4, 0, 3.5]) ax.set_ylabel('$x_1$', fontsize=12) ax.set_xlabel('$x_0$', fontsize=12) plt.show() defcompute_cost_logistic(X, y, w, b): """ Computes cost Args: X (ndarray): Shape (m,n) matrix of examples with n features y (ndarray): Shape (m,) target values w (ndarray): Shape (n) parameters for prediction b (scalar): parameter for prediction Returns: cost (scalar): cost """ m=X.shape[0] loss=0 for i inrange(m): gz=sigmoid(np.dot(w,X[i])+b) loss_i=-y[i]*np.log(gz)-(1-y[i])*np.log(1-gz) loss+=loss_i loss=(1/m)*loss return loss # 检查测试 w=np.array([1,1]);b=-3;print(y.shape,w.shape,w) print(compute_cost_logistic(X, y, w, b))
m is the number of training examples in the data set
$f_{w,b}(x^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target
For the logistic regression model: $z = \mathbf{w} \cdot \mathbf{x} + b$ and $f_{\mathbf{w},b}(\mathbf{x}) = g(z)$, where $g(z)$ is the sigmoid function: $g(z) = \frac{1}{1+e^{-z}}$. The gradient descent algorithm implementation has two components:
The loop implementing equation (1) above. This is gradient_descent below and is generally provided to you in optional and practice labs.
The calculation of the current gradient, equations (2,3) above. This is compute_gradient_logistic below. You will be asked to implement this in this week's practice lab. It implements equations (2) and (3) above for all $w_j$ and $b$. There are many ways to implement this; outlined below is one:
initialize variables to accumulate dj_dw and dj_db
for each example
calculate the error for that example: $g(\mathbf{w} \cdot \mathbf{x}^{(i)} + b) - y^{(i)}$
for each input value $x_j^{(i)}$ in this example,
multiply the error by the input $x_j^{(i)}$, and add to the corresponding element of dj_dw. (equation 2 above)
add the error to dj_db (equation 3 above)
divide dj_db and dj_dw by total number of examples (m)
note that $x^{(i)}$ in numpy is X[i,:] or X[i], and $x_j^{(i)}$ is X[i,j]

In the compute_gradient_logistic helper that computes the partial derivatives, initializing dj_dw (ndarray Shape (n,)) with dj_dw=np.zeros_like(n) raises an error, while dj_dw = np.zeros((n,)) works. The former produces a 0-dimensional array with shape (), whereas the latter is a 1-D array of shape (n,), so the former's dimensions do not match. Alternatively, np.zeros_like(X[0]) also produces a 1-D array of shape (n,); n here is the number of columns of X.

Gradient descent code

The code below implements equation (1) above. Take a moment to find the routines in the function and compare them to the equations above.

In this function, `for j in range(n): w[j] = w[j] - alpha*dj_dw[j]` is unnecessary: the dj_dw returned above is already a 1-D array of shape (n,), so `w = w - alpha*dj_dw` updates all w[j] for j from 0 to n-1 at once. The checks `if i % math.ceil(num_iters/10) == 0:` and `if num_iters <= 100000:` must sit inside the for loop so the cost is computed, saved, and printed as the iterations run.

Plotting the decision boundary: since z = w0*x0 + w1*x1 + b and z = 0 is the boundary, set w0*x0 + w1*x1 + b = 0 and substitute x0 = 0 and x1 = 0 in turn to get two points in the (x0, x1) plane; plotting the straight line through those two points gives the boundary (see the sketch after the full code below).

A gradient descent function written for 2-D X (ndarray of shape (m,n)) can also handle 1-D data: convert a 1-D x_train = np.array([0., 1, 2, 3, 4, 5]) into a 2-D single-column array with x_train = x_train.reshape(-1,1); the number of columns is 1 because there is only one feature.

Full code:
import numpy as np # %matplotlib widget import matplotlib.pyplot as plt plt.style.use('./deeplearning.mplstyle') plt.rcParams['font.size'] = 8 import copy, math from lab_utils_common import dlc, plot_data, plt_tumor_data, sigmoid, compute_cost_logistic from plt_quad_logistic import plt_quad_logistic, plt_prob #让我们从决策边界实验室中使用的相同两个特征数据集开始。 X_train = np.array([[0.5, 1.5], [1,1], [1.5, 0.5], [3, 0.5], [2, 2], [1, 2.5]]) y_train = np.array([0, 0, 0, 1, 1, 1]) #和以前一样,我们将 # pos = y_train == 1 # neg = y_train == 0 # pos = pos.reshape(-1,) #work with 1D or 1D y vectors # neg = neg.reshape(-1,) # print(pos,neg,pos.shape,neg.shape) fig,ax=plt.subplots(1,1,figsize=(6,6)) plot_data(X_train,y_train,ax) ax.set_xlabel('$x_0$');ax.set_ylabel('$x_1$'); plt.show() defcompute_gradient_logistic(X, y, w, b): """ Computes the gradient for linear regression Args: X : (ndarray Shape (m,n)) variable such as house size y : (ndarray Shape (m,)) actual value w : (ndarray Shape (n,)) parameters of the model b : (scalar) parameter of the model Returns dj_dw: (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. dj_db: (scalar) The gradient of the cost w.r.t. the parameter b. """ m,n=X.shape dj_db=0 dj_dw=np.zeros_like(X[0]) # 或dj_dw = np.zeros((n,)) for i inrange(m): fwb_i=sigmoid(np.dot(X[i],w)+b) err=fwb_i-y[i] dj_db+=(1/m)*err for j inrange(n): dj_dw[j]+=(1/m)*err*X[i,j] return dj_dw,dj_db # 测试 X_tmp = np.array([[0.5, 1.5], [1,1], [1.5, 0.5], [3, 0.5], [2, 2], [1, 2.5]]) y_tmp = np.array([0, 0, 0, 1, 1, 1]) w = np.array([2.,3.]) b = 1. dj_dw, dj_db = compute_gradient_logistic(X_tmp, y_tmp, w, b) print(f"dj_db, non-vectorized version: {dj_db}" ) print(f"dj_dw, non-vectorized version: {dj_dw.tolist()}" ) dj_dw1=np.zeros_like(2);dj_dw2=np.zeros(2);print(dj_dw1,dj_dw2,dj_dw1.shape,dj_dw2.shape) defgradient_descent(X, y, w_in, b_in, alpha, num_iters): """ Performs batch gradient descent Args: X (ndarray): Shape (m,n) matrix of examples y (ndarray): Shape (m,) target value of each example w_in (ndarray): Shape (n,) Initial values of parameters of the model b_in (scalar): Initial value of parameter of the model alpha (float): Learning rate num_iters (int): number of iterations to run gradient descent Returns: w (ndarray): Shape (n,) Updated values of parameters b (scalar): Updated value of parameter """ w=copy.deepcopy(w_in) b=b_in m,n=X.shape J_history = [] #存储成本 for i inrange(num_iters): dj_dw, dj_db = compute_gradient_logistic(X, y, w, b) b=b-alpha*dj_db w=w-alpha*dj_dw if num_iters<=100000: J_history.append(compute_cost_logistic(X, y, w, b)) if i % math.ceil(num_iters/10)==0: print(f"Iteration {i:4d}: Cost {J_history[-1]} ") return w, b, J_history, #return final w,b and J history for graphing w_in = np.zeros_like(X_train[0]) b_in = 0. alpha = 0.1 num_iters = 10000 w_out, b_out, _ = gradient_descent(X_train, y_train, w_in, b_in, alpha, num_iters) print(f"\nupdated parameters: w:{w_out}, b:{b_out}") fig,ax = plt.subplots(1,1,figsize=(5,4)) # plot the probability plt_prob(ax, w_out, b_out)
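The decision-boundary plot described above (solve w0*x0 + w1*x1 + b = 0 for the two axis intercepts and connect them) could be sketched as a continuation of the cell above, assuming w_out and b_out are the parameters returned by gradient_descent:

```python
# plot the original data on the same axes
plot_data(X_train, y_train, ax)

# decision boundary: w0*x0 + w1*x1 + b = 0
x0_at_zero = -b_out / w_out[0]   # x0-intercept (where x1 = 0)
x1_at_zero = -b_out / w_out[1]   # x1-intercept (where x0 = 0)
ax.plot([0, x0_at_zero], [x1_at_zero, 0], c='b', lw=1, label='decision boundary')

ax.set_xlabel('$x_0$'); ax.set_ylabel('$x_1$')
ax.legend()
plt.show()
```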
Regularization can fix overfitting

In this lab you will extend the previous linear and logistic cost functions with a regularization term, and rerun the earlier overfitting example with the regularization term added.

The slides above show the cost and gradient functions for both linear and logistic regression. Note:

Cost — the cost functions differ significantly between linear and logistic regression, but adding regularization to the equations is the same.

Gradient — the gradient functions for linear and logistic regression are very similar. They differ only in the implementation of $f_{\mathbf{w},b}$.

Cost function for regularized linear regression

The equation for the cost function for regularized linear regression is:
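In standard form (this is what compute_cost_linear_reg below computes):

$$J(\mathbf{w},b) = \frac{1}{2m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=0}^{n-1} w_j^2$$

where:

$$f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x}^{(i)} + b$$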
Compare this to the cost function without regularization (which you implemented in a previous lab), which is of the form:
$$J(\mathbf{w},b) = \frac{1}{2m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)^2$$
The difference is the regularization term, $\frac{\lambda}{2m}\sum_{j=0}^{n-1} w_j^2$. Including this term encourages gradient descent to minimize the size of the parameters. Note that in this example the parameter $b$ is not regularized; this is standard practice.

Below is an implementation of equations (1) and (2). Note that it uses the standard pattern for this course, a for loop over all m examples. The regularization term and the (unregularized) cost are added in parallel; do not write nested loops. Nesting j inside i adds the regularization term i extra times, and for linear regression f_wb should not use sigmoid, since this is not logistic regression. A classic mistake is `for j in range(n): J_cost1 += J_cost0 + (lambda_/(2*m))*(w[j]**2)` — this statement adds J_cost0 j times when it should only be added once.

Cost function for regularized logistic regression

For regularized logistic regression, the cost function is of the form:
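In standard form (matching compute_cost_logistic_reg below):

$$J(\mathbf{w},b) = \frac{1}{m}\sum_{i=0}^{m-1}\left[-y^{(i)}\log\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)})\right) - \left(1 - y^{(i)}\right)\log\left(1 - f_{\mathbf{w},b}(\mathbf{x}^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=0}^{n-1} w_j^2$$

where:

$$f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \text{sigmoid}(\mathbf{w} \cdot \mathbf{x}^{(i)} + b)$$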
As was the case in linear regression above, the difference is the regularization term, which is $\frac{\lambda}{2m}\sum_{j=0}^{n-1} w_j^2$
Including this term incentivizes gradient descent to minimize the size of the parameters. Note that in this example the parameter $b$ is not regularized; this is standard practice.

Gradient descent with regularization

The basic algorithm for running gradient descent does not change with regularization; it is:
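For reference, the algorithm in its standard form (the same form as equation (3) earlier):

$$\begin{align*} &\text{repeat until convergence:} \; \lbrace \\ &\quad w_j = w_j - \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \quad \text{for } j = 0..n-1 \\ &\quad b \;\,= b - \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\ &\rbrace \end{align*}$$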
Where each iteration performs simultaneous updates on $w_j$ for all $j$.
What changes with regularization is computing the gradients. The gradient calculation for linear and logistic regression is nearly identical, differing only in the computation of $f_{\mathbf{w},b}$.
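For reference, the regularized gradients in their standard form (this is what compute_gradient_linear_reg below implements for the linear case) are:

$$\frac{\partial J(\mathbf{w},b)}{\partial w_j} = \frac{1}{m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m} w_j$$

$$\frac{\partial J(\mathbf{w},b)}{\partial b} = \frac{1}{m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)$$

where: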
m is the number of training examples in the data set
$f_{w,b}(x^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target
For a linear regression model: $f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b$
For a logistic regression model: $z = \mathbf{w} \cdot \mathbf{x} + b$ and $f_{\mathbf{w},b}(\mathbf{x}) = g(z)$, where $g(z)$ is the sigmoid function: $g(z) = \frac{1}{1+e^{-z}}$
The term which adds regularization is $\frac{\lambda}{m} w_j$. In def compute_gradient_linear_reg(X, y, w, b, lambda_), the statement `dj_dw+=(1/m)*(fwb_i-y[i])*X[i,j]+(lambda_/m)*w[j]` should use `dj_dw[j]` — without the index the term is added to every element of the array. Also, the second term `+(lambda_/m)*w[j]` should not be nested inside the loop over i, because there is no summation over i in that term.

Full code:
```python
import numpy as np
import matplotlib.pyplot as plt
from plt_overfit import overfit_example, output
from lab_utils_common import sigmoid
np.set_printoptions(precision=8)
```
defcompute_cost_linear_reg(X, y, w, b, lambda_ = 1): """ Computes the cost over all examples Args: X (ndarray (m,n): Data, m examples with n features y (ndarray (m,)): target values w (ndarray (n,)): model parameters b (scalar) : model parameter lambda_ (scalar): Controls amount of regularization Returns: total_cost (scalar): cost """ m,n=X.shape;fwb_i=J_cost0=J_cost1=0 for i inrange(m): fwb_i=np.dot(X[i],w)+b J_cost0+=(1/(2*m))*((fwb_i-y[i])**2) for j inrange(n): J_cost1+=(lambda_/(2*m))*(w[j]**2) total_cost=J_cost1+J_cost0 return total_cost # 测试 np.random.seed(1) X_tmp = np.random.rand(5,6) y_tmp = np.array([0,1,0,1,0]) w_tmp = np.random.rand(X_tmp.shape[1]).reshape(-1,)-0.5 b_tmp = 0.5 lambda_tmp = 0.7 cost_tmp = compute_cost_linear_reg(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp)
print("Regularized cost:", cost_tmp) defcompute_cost_logistic_reg(X, y, w, b, lambda_ = 1): """ Computes the cost over all examples Args: Args: X (ndarray (m,n): Data, m examples with n features y (ndarray (m,)): target values w (ndarray (n,)): model parameters b (scalar) : model parameter lambda_ (scalar): Controls amount of regularization Returns: total_cost (scalar): cost """ m,n=X.shape;Jz_cost0=Jz_cost1=0 for i inrange(m): gz=sigmoid(np.dot(X[i],w)+b) Jz_cost0+=(1/m)*(-y[i]*np.log(gz)-(1-y[i])*np.log(1-gz)) for j inrange(n): Jz_cost1+=(lambda_/(2*m))*(w[j]**2) total_cost=Jz_cost0+Jz_cost1 return total_cost #测试 np.random.seed(1) X_tmp = np.random.rand(5,6) y_tmp = np.array([0,1,0,1,0]) w_tmp = np.random.rand(X_tmp.shape[1]).reshape(-1,)-0.5 b_tmp = 0.5 lambda_tmp = 0.7 cost_tmp = compute_cost_logistic_reg(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp) print("Regularized cost:", cost_tmp) #用于正则化线性回归的梯度函数 defcompute_gradient_linear_reg(X, y, w, b, lambda_): """ Computes the gradient for linear regression Args: X (ndarray (m,n): Data, m examples with n features y (ndarray (m,)): target values w (ndarray (n,)): model parameters b (scalar) : model parameter lambda_ (scalar): Controls amount of regularization Returns: dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. dj_db (scalar): The gradient of the cost w.r.t. the parameter b. """ m,n=X.shape dj_db=fwb_i=0 dj_dw=np.zeros((n,)) for i inrange(m): fwb_i=np.dot(X[i],w)+b dj_db+=(1/m)*(fwb_i-y[i]) for j inrange(n): dj_dw[j]+=(1/m)*(fwb_i-y[i])*X[i,j] for j inrange(n): dj_dw[j]+=(lambda_/m)*w[j] return dj_dw,dj_db
```python
def eval_mse(y, yhat):
    """
    Calculate the mean squared error on a data set.
    Args:
      y    : (ndarray  Shape (m,) or (m,1))  target value of each example
      yhat : (ndarray  Shape (m,) or (m,1))  predicted value of each example
    Returns:
      err: (scalar)
    """
    m = len(y)
    err = 0.0
    for i in range(m):
        ### START CODE HERE ###
        err_i = (1 / (2 * m)) * (y[i] - yhat[i]) ** 2
        err += err_i
        ### END CODE HERE ###
    return(err)
```
```python
def gen_data(m, seed=1, scale=0.7):
    """ generate a data set based on a x^2 with added noise """
    c = 0
    x_train = np.linspace(0, 49, m)
    np.random.seed(seed)
    y_ideal = x_train**2 + c
    y_train = y_ideal + scale * y_ideal * (np.random.sample((m,)) - 0.5)
    x_ideal = x_train  # for redraw when new data included in X
    return x_train, y_train, x_ideal, y_ideal
```
```python
import numpy as np
# %matplotlib widget
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

tf.keras.backend.set_floatx('float64')
from assigment_utils import *
```
tf.autograph.set_verbosity(0) # Generate some data X,y,x_ideal,y_ideal = gen_data(18, 2, 0.7) print("X.shape", X.shape, "y.shape", y.shape,"y_ideal",y_ideal) print(x_ideal,y_ideal) #split the data using sklearn routine X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=1) print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape) print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)
fig,ax=plt.subplots(1,1,figsize=(4,4)) ax.plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1) ax.set_title("Training, Test",fontsize = 14) ax.set_xlabel("x") ax.set_ylabel("y") ax.scatter(X_train, y_train, color = "red", label="train") ax.scatter(X_test, y_test, color = dlc["dlblue"], label="test") ax.legend(loc='upper left') plt.show() # UNQ_C1 # GRADED CELL: eval_mse defeval_mse(y, yhat): """ Calculate the mean squared error on a data set. Args: y : (ndarray Shape (m,) or (m,1)) target value of each example yhat : (ndarray Shape (m,) or (m,1)) predicted value of each example Returns: err: (scalar) """ m = len(y) err = 0.0 for i inrange(m): ### START CODE HERE ### err_i=(1/(2*m))*(y[i]-yhat[i])**2 err+=err_i ### END CODE HERE ### return(err) #比较训练和测试数据的性能 # create a model in sklearn, train on training data degree = 10 lmodel = lin_model(degree) #生成最高次数为10次的模型 lmodel.fit(X_train, y_train)
```python
# predict on training data, find training error
yhat = lmodel.predict(X_train)
err_train = lmodel.mse(y_train, yhat)
```
defcompute_entropy(y): """ Computes the entropy for Args: y (narray): Numpy一维数组,表示节点上的每个示例是否为 可食用(' 1 ')或有毒(' 0 ') Returns: entropy (float):该节点的熵 """ # You need to return the following variables correctly entropy = 0. l=len(y) ### START CODE HERE ### #统计y数组中1的个数 if l!=0: t0=np.sum(np.where(y,0,1)) if t0==0or t0==l: entropy=0 else: t1=l-t0 p1=t1/l entropy=-p1*np.log2(p1)-(1-p1)*np.log2(1-p1) ### END CODE HERE ### return entropy
defcompute_information_gain(X, y, node_indices, feature): """ Compute the information of splitting the node on a given feature Args: X (ndarray): Data matrix of shape(n_samples, n_features) y (array like): list or ndarray with n_samples containing the target variable node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step. Returns: cost (float): Cost computed """ # Split dataset left_indices, right_indices = split_dataset(X, node_indices, feature) # Some useful variables X_node, y_node = X[node_indices], y[node_indices] X_left, y_left = X[left_indices], y[left_indices] X_right, y_right = X[right_indices], y[right_indices] # You need to return the following variables correctly information_gain = 0 ### START CODE HERE ### # Weights w_left=len(X_left)/len(X_node) w_right=len(X_right)/len(X_node) HP1_node=compute_entropy(y_node) HP1_left=compute_entropy(y_left) HP1_right=compute_entropy(y_right) #Weighted entropy #Information gain information_gain = HP1_node-(w_left*HP1_left+w_right*HP1_right) ### END CODE HERE ### return information_gain
defget_best_split(X, y, node_indices): """ Returns the optimal feature and threshold value to split the node data Args: X (ndarray): Data matrix of shape(n_samples, n_features) y (array like): list or ndarray with n_samples containing the target variable node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step. Returns: best_feature (int): The index of the best feature to split """ # Some useful variables num_features = X.shape[1] best_i=[] # You need to return the following variables correctly best_feature = -1 ### START CODE HERE ### for i inrange(num_features): best_i.append(compute_information_gain(X,y,node_indices,i)) best_feature=best_i.index(max(best_i)) ### END CODE HERE ## return best_feature
```python
import numpy as np
import matplotlib.pyplot as plt
from public_tests import *
```
```python
X_train = np.array([[1,1,1],[1,0,1],[1,0,0],[1,0,0],[1,1,1],[0,1,1],[0,0,0],[1,0,1],[0,1,0],[1,0,0]])
y_train = np.array([1,1,0,0,1,0,0,1,1,0])

print("First few elements of X_train:\n", X_train[:5,:])
print("Type of X_train:", type(X_train))
print("First few elements of y_train:", y_train[:5])
print("Type of y_train:", type(y_train))
print('Number of training examples (m):', len(X_train))
```
# UNQ_C1 # GRADED FUNCTION: compute_entropy
defcompute_entropy(y): """ Computes the entropy for Args: y (narray): Numpy一维数组,表示节点上的每个示例是否为 可食用(' 1 ')或有毒(' 0 ') Returns: entropy (float):该节点的熵 """ # You need to return the following variables correctly entropy = 0. l=len(y) ### START CODE HERE ### #统计y数组中1的个数 if l!=0: t0=np.sum(np.where(y,0,1)) if t0==0or t0==l: entropy=0 else: t1=l-t0 p1=t1/l entropy=-p1*np.log2(p1)-(1-p1)*np.log2(1-p1) ### END CODE HERE ### return entropy # Compute entropy at the root node (i.e. with all examples) # Since we have 5 edible and 5 non-edible mushrooms, the entropy should be 1"
print("Entropy at root node: ", compute_entropy(y_train))
# UNIT TESTS compute_entropy_test(compute_entropy)
# UNQ_C2 # GRADED FUNCTION: split_dataset
defsplit_dataset(X, node_indices, feature): """ Splits the data at the given node into left and right branches Args: X (ndarray): Data matrix of shape(n_samples, n_features) node_indices (ndarray): 包含活动索引的列表。即,在此步骤中考虑的样本。 feature (int): Index of feature to split on s所要分割元素的索引 Returns: left_indices (ndarray): Indices with feature value == 1 right_indices (ndarray): Indices with feature value == 0 """ # You need to return the following variables correctly left_indices = [] right_indices = [] ### START CODE HERE ### for i in node_indices: if X[i][feature]==1: left_indices.append(i) elif X[i][feature]==0: right_indices.append(i) ### END CODE HERE ### return left_indices, right_indices root_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Feel free to play around with these variables # The dataset only has three features, so this value can be 0 (Brown Cap), 1 (Tapering Stalk Shape) or 2 (Solitary) feature = 0
# UNIT TESTS split_dataset_test(split_dataset) # UNQ_C3 # GRADED FUNCTION: compute_information_gain
defcompute_information_gain(X, y, node_indices, feature): """ Compute the information of splitting the node on a given feature Args: X (ndarray): Data matrix of shape(n_samples, n_features) y (array like): list or ndarray with n_samples containing the target variable node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step. Returns: cost (float): Cost computed """ # Split dataset left_indices, right_indices = split_dataset(X, node_indices, feature) # Some useful variables X_node, y_node = X[node_indices], y[node_indices] X_left, y_left = X[left_indices], y[left_indices] X_right, y_right = X[right_indices], y[right_indices] # You need to return the following variables correctly information_gain = 0 ### START CODE HERE ### # Weights w_left=len(X_left)/len(X_node) w_right=len(X_right)/len(X_node) HP1_node=compute_entropy(y_node) HP1_left=compute_entropy(y_left) HP1_right=compute_entropy(y_right) #Weighted entropy #Information gain information_gain = HP1_node-(w_left*HP1_left+w_right*HP1_right) ### END CODE HERE ### return information_gain info_gain0 = compute_information_gain(X_train, y_train, root_indices, feature=0) print("Information Gain from splitting the root on brown cap: ", info_gain0) info_gain1 = compute_information_gain(X_train, y_train, root_indices, feature=1) print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)
info_gain2 = compute_information_gain(X_train, y_train, root_indices, feature=2) print("Information Gain from splitting the root on solitary: ", info_gain2)
# UNIT TESTS compute_information_gain_test(compute_information_gain)
# UNQ_C4 # GRADED FUNCTION: get_best_split
defget_best_split(X, y, node_indices): """ Returns the optimal feature and threshold value to split the node data Args: X (ndarray): Data matrix of shape(n_samples, n_features) y (array like): list or ndarray with n_samples containing the target variable node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step. Returns: best_feature (int): The index of the best feature to split """ # Some useful variables num_features = X.shape[1] best_i=[] # You need to return the following variables correctly best_feature = -1 ### START CODE HERE ### for i inrange(num_features): best_i.append(compute_information_gain(X,y,node_indices,i)) best_feature=best_i.index(max(best_i)) ### END CODE HERE ## return best_feature best_feature = get_best_split(X_train, y_train, root_indices) print("Best feature to split on: %d" % best_feature)
```python
# UNIT TESTS -- the code above passes the tests
# get_best_split_test(get_best_split)

# Not graded
tree = []
```
defbuild_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth): """ Build a tree using the recursive algorithm that split the dataset into 2 subgroups at each node. This function just prints the tree. Args: X (ndarray): Data matrix of shape(n_samples, n_features) y (array like): list or ndarray with n_samples containing the target variable node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step. branch_name (string): Name of the branch. ['Root', 'Left', 'Right'] max_depth (int): Max depth of the resulting tree. current_depth (int): Current depth. Parameter used during recursive call. """
# Maximum depth reached - stop splitting if current_depth == max_depth: formatting = " "*current_depth + "-"*current_depth print(formatting, "%s leaf node with indices" % branch_name, node_indices) return#直接退出函数 # Otherwise, get best split and split the data # Get the best feature and threshold at this node best_feature = get_best_split(X, y, node_indices) tree.append((current_depth, branch_name, best_feature, node_indices)) formatting = "-"*current_depth print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature)) # Split the dataset at the best feature left_indices, right_indices = split_dataset(X, node_indices, best_feature) # continue splitting the left and the right child. Increment current depth build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1) build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1) build_tree_recursive(X_train, y_train, root_indices, "Root", max_depth=2, current_depth=0)
K-Means Clustering
1 - Implementing K-means

The K-means algorithm is a method for automatically clustering similar data points together.

Concretely, you are given a training set {x^(1), ..., x^(m)} and you want to group the data into a few cohesive "clusters".

K-means is an iterative procedure that starts by guessing the initial centroids, and then refines this guess by repeatedly assigning examples to their closest centroids and then recomputing the centroids based on the assignments.

In pseudocode, the K-means algorithm is as follows:

```python
# Initialize centroids
# K is the number of clusters
centroids = kMeans_init_centroids(X, K)

for iter in range(iterations):
    # Cluster assignment step:
    # Assign each data point to the closest centroid.
    # idx[i] corresponds to the index of the centroid
    # assigned to example i
    idx = find_closest_centroids(X, centroids)

    # Move centroid step:
    # Compute means based on centroid assignments
    centroids = compute_means(X, idx, K)
```
The inner loop of the algorithm repeatedly carries out two steps: (i) assigning every training example x^(i) to its closest centroid, and (ii) recomputing each centroid as the mean of the points assigned to it.

The K-means algorithm will always converge to some final set of centroid means. However, the converged solution may not always be ideal — it can land in a local rather than a global optimum — and it depends on the initial placement of the centroids. Therefore, in practice the K-means algorithm is usually run several times with different random initializations. One way to choose between these different solutions is to pick the one with the lowest value of the cost function (distortion).

1.1 Finding the closest centroids

In the "cluster assignment" phase of the K-means algorithm, the algorithm assigns every training example x^(i) to its closest centroid, given the current positions of the centroids.

Exercise 1

Your task is to complete find_closest_centroids, which finds the centroid closest to each example. This function takes the data matrix X and the positions of all the centroids in centroids, and should output a one-dimensional array idx (with the same number of elements as X) that holds, for each training example, the index of the closest centroid (a value in {0, ..., K−1}, where K is the total number of centroids).

Specifically, for every example x^(i) we set

$$c^{(i)} := j \quad \text{that minimizes} \quad ||x^{(i)} - \mu_j||^2,$$

where c^(i) is the index of the centroid closest to x^(i) (corresponding to idx[i] in the starter code), and μ_j is the position (value) of the j-th centroid (stored in centroids in the starter code). If you get stuck, the hints shown after the cell below can help you with the implementation.

The utils load_data here differs from the earlier labs' files; relative imports based on `from` are awkward, so simply rename the file. There are m examples X[i], each a row of a 2-D array, and the randomly generated centroids are also a 2-D array, so the distance here is the ordinary distance between 2-D coordinates.

Error: "index 2 is out of bounds for axis 1 with size 2" comes from the column index overflowing in `jvli=(X[i,0]-centroids[j,0])**2+(X[i,1]-centroids[j,1])**2`; column indices also start from 0 (print(X[1,0]) to inspect the data). Why is using `jvliz0=[]` together with `jvliz0.append(jvli)` initially wrong? Because inside the double loop, .append keeps adding to the end of the list, so the list meant to hold the distances from X[i] to the centroids ends up containing distances for all examples, which gives the wrong result. Re-assigning it to an empty list at the end of each outer iteration (jvliz0=[]) fixes it. The following code is preferred:
```python
# UNQ_C1
# GRADED FUNCTION: find_closest_centroids

def find_closest_centroids(X, centroids):
    """
    Computes the centroid memberships for every example
    Args:
        X (ndarray): (m, n) Input values
        centroids (ndarray): k centroids
    Returns:
        idx (array_like): (m,) closest centroids
    """
    # Set K; K is the total number of centroids
    K = centroids.shape[0]
    m, n = X.shape

    # You need to return the following variables correctly
    idx = np.zeros(m, dtype=int)
    jvliz0 = []
    # jvliz0 = np.zeros(K, dtype=int)

    ### START CODE HERE ###
    # compute the distance from each X[i] to the K centroids and pick the closest one
    for i in range(m):
        for j in range(K):
            jvli = float((X[i,0] - centroids[j,0])**2 + (X[i,1] - centroids[j,1])**2)
            # jvli = np.linalg.norm(X[i] - centroids[j])
            jvliz0.append(jvli)
            # jvliz0[j] = jvli
        # print(jvliz0)
        idx[i] = np.argmin(jvliz0)
        jvliz0 = []  # key step: reset the distance list for the next example
    ### END CODE HERE ###
    return idx
```
defcompute_centroids(X, idx, K): """ Returns the new centroids by computing the means of the data points assigned to each centroid. Args: X (ndarray): (m, n) Data points idx (ndarray): (m,) Array containing index of closest centroid for each example in X. Concretely, idx[i] contains the index of the centroid closest to example i K (int): number of centroids Returns: centroids (ndarray): (K, n) New centroids computed """ # Useful variables m, n = X.shape # You need to return the following variables correctly centroids = np.zeros((K, n)) # for i in range(K): # exec(f'X_cz{i}=[]') ### START CODE HERE ### # for i in range(m): # for j in range(K): # if idx[i]==j: # exec(f'X_cz{j}.append(X[i])') for k inrange(K): points=X[idx==k] centroids[k] = np.mean(points, axis = 0) ### END CODE HERE ## return centroids
# UNQ_C1 # GRADED FUNCTION: find_closest_centroids deffind_closest_centroids(X, centroids): """ Computes the centroid memberships for every example Args: X (ndarray): (m, n) Input values centroids (ndarray): k centroids Returns: idx (array_like): (m,) closest centroids """
# Set K ,K is total number of centroids K = centroids.shape[0] m,n=X.shape # You need to return the following variables correctly idx = np.zeros(m, dtype=int) # jvliz0=[] # jvliz0=np.zeros(K, dtype=int) ### START CODE HERE ### #计算每个Xi到K个质心的距离,寻找最短的那个质心 for i inrange(m): jvliz0=[] #关键 for j inrange(K): # jvli=float((X[i,0]-centroids[j,0])**2+(X[i,1]-centroids[j,1])**2) jvli = np.linalg.norm(X[i] - centroids[j]) #jvli=np.sum(np.power((centroids-X[i]),2),1) jvliz0.append(jvli) # jvliz0[j]=jvli # print(jvliz0) idx[i]=np.argmin(jvliz0)
### END CODE HERE ### return idx # Select an initial set of centroids (3 Centroids) initial_centroids = np.array([[3,3], [6,2], [8,5]])
```python
# Find closest centroids using initial_centroids
idx = find_closest_centroids(X, initial_centroids)

# Print closest centroids for the first three elements
print("First three elements in idx are:", idx[:3])

# UNIT TEST
from public_tests2 import *
```
find_closest_centroids_test(find_closest_centroids) # K=10 # for i in range(K): # exec(f'X_cz{i}=[1,2]') # print(X_cz0) # UNQ_C2 # GRADED FUNCTION: compute_centpods # points=X[idx==0] # print(points) defcompute_centroids(X, idx, K): """ Returns the new centroids by computing the means of the data points assigned to each centroid. Args: X (ndarray): (m, n) Data points idx (ndarray): (m,) Array containing index of closest centroid for each example in X. Concretely, idx[i] contains the index of the centroid closest to example i K (int): number of centroids Returns: centroids (ndarray): (K, n) New centroids computed """ # Useful variables m, n = X.shape # You need to return the following variables correctly centroids = np.zeros((K, n)) # for i in range(K): # exec(f'X_cz{i}=[]') ### START CODE HERE ### # for i in range(m): # for j in range(K): # if idx[i]==j: # exec(f'X_cz{j}.append(X[i])') for k inrange(K): points=X[idx==k] centroids[k] = np.mean(points, axis = 0) ### END CODE HERE ## return centroids K = 3 centroids = compute_centroids(X, idx, K) print("The centroids are:", centroids) # You do not need to implement anything for this part
```python
# You do not need to implement anything for this part
def run_kMeans(X, initial_centroids, max_iters=10, plot_progress=False):
    """
    Runs the K-Means algorithm on data matrix X, where each row of X
    is a single example
    """
    # Initialize values
    m, n = X.shape
    K = initial_centroids.shape[0]
    centroidsnow = initial_centroids
    centroidsold = centroidsnow
    idx = np.zeros(m)

    # Run K-Means
    for i in range(max_iters):
        print(f"Iteration {i+1} of {max_iters}, {max_iters-i-1} remaining")
        idx = find_closest_centroids(X, centroidsnow)
        centroidsold = centroidsnow
        centroidsnow = compute_centroids(X, idx, K)
        # print and compare centroid positions to judge convergence
        print(centroidsnow == centroidsold)

    # return the final centroids and assignments (the caller below unpacks both)
    return centroidsnow, idx
```
# You do not need to modify this part # 这个随机算法是先把X打乱然后把前K个X直接作为初试质心 defkMeans_init_centroids(X, K): """ This function initializes K centroids that are to be used in K-Means on the dataset X Args: X (ndarray): Data points K (int): number of centroids/clusters Returns: centroids (ndarray): Initialized centroids """ # Randomly reorder the indices of examples randidx = np.random.permutation(X.shape[0]) # Take the first K examples as centroids centroids = X[randidx[:K]] return centroids # Load an image of a bird这一步只是加载而已 original_img = plt.imread('./data/bird_small.png') # 可视化图像 #您可以使用下面的代码可视化刚刚加载的图像 plt.imshow(original_img) """ 检查变量的维度 与往常一样,您将打印出变量的形状以更熟悉数据。 """ print("Shape of original_img is:", original_img.shape) # Divide by 255 so that all values are in the range 0 - 1 original_img = original_img / 255 # Reshape the image into an m x 3 matrix where m = number of pixels # (in this case m = 128 x 128 = 16384) # Each row will contain the Red, Green and Blue pixel values # This gives us our dataset matrix X_img that we will use K-Means on. # X_img大小为m*3 *3是因为R G B三个通道 X_img = np.reshape(original_img, (original_img.shape[0] * original_img.shape[1], 3)) # Run your K-Means algorithm on this data # You should try different values of K and max_iters here K = 16 max_iters = 100 # Using the function you have implemented above. initial_centroids_t=kMeans_init_centroids(X_img,K) # Run K-Means - this takes a couple of minutes centroids_t,idx_t=run_kMeans(X_img,initial_centroids_t,max_iters) # idx_t=find_closest_centroids(X_img,centroids_t) print("Shape of idx_t:", idx_t.shape) print("Closest centroid for the first five elements:", idx_t[:5]) # Represent image in terms of indices X_recovered = centroids_t[idx_t,:] # Reshape recovered image into proper dimensions X_recovered = np.reshape(X_recovered, original_img.shape) # Display original image fig, ax = plt.subplots(1,2, figsize=(8,8)) plt.axis('off')
This is somewhat like the earlier linear-regression house-price prediction: here we predict a user's rating of a movie from the user's preferences w and the movie's features x^(i).

Notation:

| General Notation | Description | Python (if any) |
|:--|:--|:--|
| $r(i,j)$ | scalar; = 1 if user j rated movie i, = 0 otherwise | |
| $y(i,j)$ | scalar; rating given by user j on movie i (defined if r(i,j) = 1) | |
| $\mathbf{w}^{(j)}$ | vector; parameters for user j | |
| $b^{(j)}$ | scalar; parameter for user j | |
| $\mathbf{x}^{(i)}$ | vector; feature ratings for movie i | |
| $n_u$ | number of users | num_users |
| $n_m$ | number of movies | num_movies |
| $n$ | number of features | num_features |
| $\mathbf{X}$ | matrix of vectors $\mathbf{x}^{(i)}$ | X |
| $\mathbf{W}$ | matrix of vectors $\mathbf{w}^{(j)}$ | W |
| $\mathbf{b}$ | vector of bias parameters $b^{(j)}$ | b |
| $\mathbf{R}$ | matrix of elements $r(i,j)$ | R |

2 - Recommender Systems

In this lab, you will implement the collaborative filtering learning algorithm and apply it to a dataset of movie ratings. The goal of a collaborative filtering recommender system is to generate two vectors: for each user, a "parameter vector" that embodies the user's movie tastes; for each movie, a feature vector of the same size that embodies some description of the movie. The dot product of the two vectors plus a bias term should produce an estimate of the rating the user might give that movie.

The figure below details how these vectors are learned. Existing ratings are provided in matrix form: Y contains the ratings, from 0.5 to 5 in 0.5 steps, and 0 if the movie has not been rated; R is 1 where a movie has been rated. Movies are in rows and users in columns. Each user has a parameter vector w_user and a bias; each movie has a feature vector x_movie. These vectors are learned simultaneously by using the existing user/movie ratings as training data. One training example is shown above: $w^{(1)} \cdot x^{(1)} + b^{(1)} = 4$. Notably, the feature vector x_movie must satisfy all users, while the user vector w_user must satisfy all movies — that is, w_user has the same dimension as the number of features in x_movie, so their inner product is well defined. This is the source of the name of the approach: all users collaborate to generate the rating set.

Once the feature vectors and parameters are learned, they can be used to predict how a user might rate an unrated movie, as shown in the figure above; the equation there is an example of predicting user 1's rating of movie zero.

In this exercise, you will implement the function cofi_cost_func that computes the collaborative filtering objective function. After implementing the objective function, you will use a TensorFlow custom training loop to learn the parameters for collaborative filtering. The first step is to detail the data set and data structures that will be used in the lab.
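For reference, the collaborative filtering cost that cofi_cost_func below implements is, in its standard form (with regularization over both W and X):

$$J = \frac{1}{2}\sum_{j=0}^{n_u-1}\sum_{i=0}^{n_m-1} r(i,j)\left(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y(i,j)\right)^2 + \frac{\lambda}{2}\left[\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}\left(w_k^{(j)}\right)^2 + \sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}\left(x_k^{(i)}\right)^2\right]$$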
defcofi_cost_func(X, W, b, Y, R, lambda_): """ Returns the cost for the content-based filtering Args: X (ndarray (num_movies,num_features)): matrix of item features W (ndarray (num_users,num_features)) : matrix of user parameters b (ndarray (1, num_users) : vector of user parameters Y (ndarray (num_movies,num_users) : matrix of user ratings of movies R (ndarray (num_movies,num_users) : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user lambda_ (float): regularization parameter Returns: J (float) : Cost """ nm, nu = Y.shape J = 0 # ### START CODE HERE ### for j inrange(nu): Wj=W[j,:] #b只能取一个数 bj=b[0,j] for i inrange(nm): Xi=X[i, :] Yij=Y[i, j] r=R[i,j] J+=np.square(r*(np.dot(Wj,Xi)+bj-Yij)) J += lambda_ * (np.sum(np.square(W)) + np.sum(np.square(X))) J=(1/2)*J
### END CODE HERE ###
return J
If the line `r=R[i,j]` were changed to `R=R[i,j]`, it would fail, because the parameter passed in is named R and R is used again in the surrounding code. Be careful where the parentheses go when squaring with np.square. The regularization here simply squares every element of the W and X arrays, so np.square alone is enough.

Vectorized implementation

It is important to create a vectorized implementation of this computation, because it will be called many times during optimization. The linear algebra used is not the focus of this series, so the implementation is provided. If you are an expert in linear algebra, feel free to create your own version without referring to the code below.

Run the code below and verify that it produces the same result as the non-vectorized version.
defcofi_cost_func_v(X, W, b, Y, R, lambda_): """ Returns the cost for the content-based filtering Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop. Args: X (ndarray (num_movies,num_features)): matrix of item features W (ndarray (num_users,num_features)) : matrix of user parameters b (ndarray (1, num_users) : vector of user parameters Y (ndarray (num_movies,num_users) : matrix of user ratings of movies R (ndarray (num_movies,num_users) : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user lambda_ (float): regularization parameter Returns: J (float) : Cost """ j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2)) return J
Vectorization speeds things up because it eliminates the Python for loops.
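As a generic illustration of why removing the Python loops matters (not part of the lab — just the same dot product computed once with a loop and once with np.dot):

```python
import time
import numpy as np

a = np.random.rand(1_000_000)
b = np.random.rand(1_000_000)

# explicit Python loop
t0 = time.perf_counter()
s = 0.0
for i in range(len(a)):
    s += a[i] * b[i]
t1 = time.perf_counter()

# vectorized NumPy call
t2 = time.perf_counter()
s_vec = np.dot(a, b)
t3 = time.perf_counter()

print(f"loop:       {1000*(t1-t0):.1f} ms")
print(f"vectorized: {1000*(t3-t2):.1f} ms")
```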
Learning movie recommendations
After you have finished implementing the collaborative filtering cost function, you can start training your algorithm to make movie recommendations for yourself.

In the cell below, you can enter your own movie choices, and the algorithm will then make suggestions for you! We have filled in some values based on our own preferences, but after you have gone through our choices, you should change them to match your own taste. A list of all movies in the dataset is in the movie list file.

In `movieList, movieList_df = load_Movie_List_pd()`, the first return value is used to print the movie titles, and the second to print the csv file contents, including the index. The line `my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]` stores the indices of the movies I rated, in ascending order. Our code for rating movies by index is as follows:
my_ratings = np.zeros(num_movies) # Initialize my ratings
# Check the file small_movie_list.csv for id of each movie in our dataset # For example, Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set my_ratings[2700] = 5
#Or suppose you did not enjoy Persuasion (2007), you can set my_ratings[2609] = 2;
```python
# We have selected a few movies we liked / did not like and the ratings we
# gave are as follows:
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)

# store the indices of the movies I rated, in ascending order
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]
print(my_rated)

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Rated {my_ratings[i]} for {movieList_df.loc[i,"title"]}')
```
import numpy as np import tensorflow as tf from tensorflow import keras from recsys_utils import * #Load data X, W, b, num_movies, num_features, num_users = load_precalc_params_small() Y, R = load_ratings_small() print("Y", Y.shape, "R", R.shape) print("X", X.shape) print("W", W.shape) print("b", b.shape) print("num_features", num_features) print("num_movies", num_movies) print("num_users", num_users) # From the matrix, we can compute statistics like average rating. # list = [] # for i in range(443): # list.append(False) # print(np.array(list).shape) a=Y[0, R[0, :].astype(bool)] # b=R[0, :].astype(bool)
# GRADED FUNCTION: cofi_cost_func # UNQ_C1 defcofi_cost_func(X, W, b, Y, R, lambda_): """ Returns the cost for the content-based filtering Args: X (ndarray (num_movies,num_features)): matrix of item features W (ndarray (num_users,num_features)) : matrix of user parameters b (ndarray (1, num_users) : vector of user parameters Y (ndarray (num_movies,num_users) : matrix of user ratings of movies R (ndarray (num_movies,num_users) : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user lambda_ (float): regularization parameter Returns: J (float) : Cost """ nm, nu = Y.shape J = 0 # ### START CODE HERE ### for j inrange(nu): Wj=W[j,:] #b只能取一个数 bj=b[0,j] for i inrange(nm): Xi=X[i, :] Yij=Y[i, j] r=R[i,j] J+=np.square(r*(np.dot(Wj,Xi)+bj-Yij))
# Evaluate cost function J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0); print(f"Cost: {J:0.2f}") # Evaluate cost function with regularization J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 1.5); print(f"Cost (with regularization): {J:0.2f}") defcofi_cost_func_v(X, W, b, Y, R, lambda_): """ Returns the cost for the content-based filtering Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop. Args: X (ndarray (num_movies,num_features)): matrix of item features W (ndarray (num_users,num_features)) : matrix of user parameters b (ndarray (1, num_users) : vector of user parameters Y (ndarray (num_movies,num_users) : matrix of user ratings of movies R (ndarray (num_movies,num_users) : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user lambda_ (float): regularization parameter Returns: J (float) : Cost """ j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2)) return J # Evaluate cost function J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 0); print(f"Cost: {J:0.2f}")
```python
# Evaluate cost function with regularization
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 1.5)
print(f"Cost (with regularization): {J:0.2f}")
```
my_ratings = np.zeros(num_movies) # Initialize my ratings
# Check the file small_movie_list.csv for id of each movie in our dataset # For example, Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set my_ratings[2700] = 5
#Or suppose you did not enjoy Persuasion (2007), you can set my_ratings[2609] = 2;
# We have selected a few movies we liked / did not like and the ratings we # gave are as follows: my_ratings[929] = 5# Lord of the Rings: The Return of the King, The my_ratings[246] = 5# Shrek (2001) my_ratings[2716] = 3# Inception my_ratings[1150] = 5# Incredibles, The (2004) my_ratings[382] = 2# Amelie (Fabuleux destin d'Amélie Poulain, Le) my_ratings[366] = 5# Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) my_ratings[622] = 5# Harry Potter and the Chamber of Secrets (2002) my_ratings[988] = 3# Eternal Sunshine of the Spotless Mind (2004) my_ratings[2925] = 1# Louis Theroux: Law & Disorder (2008) my_ratings[2937] = 1# Nothing to Declare (Rien à déclarer) my_ratings[793] = 5# Pirates of the Caribbean: The Curse of the Black Pearl (2003) #用于存储被我评价的电影对应的序号,并按升序排列 my_rated = [i for i inrange(len(my_ratings)) if my_ratings[i] > 0] print(my_rated)
print('\nNew user ratings:\n') for i inrange(len(my_ratings)): if my_ratings[i] > 0 : print(f'Rated {my_ratings[i]} for {movieList_df.loc[i,"title"]}'); # Reload ratings and add new ratings Y, R = load_ratings_small() Y = np.c_[my_ratings, Y] R = np.c_[(my_ratings != 0).astype(int), R]
```python
# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

# Useful Values
num_movies, num_users = Y.shape
num_features = 100

# The following code initializes W, X and b so they can be used in gradient descent.
# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234)  # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal((1,          num_users),    dtype=tf.float64), name='b')

# Instantiate an optimizer (Adam)
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

# Run gradient descent iterations using the cost function defined above
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow's GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# restore the mean (this relates to the normalization done earlier)
pm = p + Ymean

my_predictions = pm[:,0]
```
# sort predictions 对预测结果进行降序排序 ix = tf.argsort(my_predictions, direction='DESCENDING') """ 由于上文使用了降序,所以这里取前17个评分 最高的电影由于每次迭代产生的W,b不同所以预测出的 电影可能不一样,如果不在之前我的评论中则 显示该电影的预测分数 """ for i inrange(17): j = ix[i] if j notin my_rated: print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}') """ 如果之前对电影做出过评论 则显示原始值和预测值 """ print('\n\nOriginal vs Predicted ratings:\n') for i inrange(len(my_ratings)): if my_ratings[i] > 0: print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}') """ 利用额外的信息来增强我们的预测其实不应该放在最后 为了防止歧义因此注释掉 """ # filter=(movieList_df["number of ratings"] > 20) # movieList_df["pred"] = my_predictions # movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"]) # movieList_df.loc[ix[:300]].loc[filter].sort_values("mean rating", ascending=False)
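normalizeRatings comes from recsys_utils and its body is not shown here. A plausible minimal sketch of what such a helper does — computing each movie's mean over only its rated entries and subtracting that mean only where R == 1 — might look like the following (an assumption, not the actual utility):

```python
import numpy as np

def normalize_ratings_sketch(Y, R):
    """Per-movie mean over rated entries only; the mean is subtracted only where a rating exists."""
    Ymean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)
    Ynorm = Y - np.multiply(Ymean, R)   # unrated entries stay at 0
    return Ynorm, Ymean
```

Adding Ymean back after training (the `pm = p + Ymean` step above) restores the predictions to the original rating scale, which is also why a brand-new user with no ratings ends up being predicted each movie's average rating.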
```python
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
scaledata = True  # applies the standard scaler to data if true
print(f"Number of training vectors: {len(item_train)}")
```
Data preprocessing with sklearn. Fit(): this method calculates the parameters μ and σ and saves them as internal objects — in other words, it computes properties intrinsic to the training set X (its mean, variance, maximum, minimum, and so on) and can be thought of as a training step. Transform(): this method applies the transformation to a particular dataset using those calculated parameters — on top of Fit, it performs the standardization, dimensionality reduction, normalization, etc. (depending on which tool is used, e.g. PCA or StandardScaler). y_train contains the users' actual movie ratings and is used in later processing.

Split and shuffle the data:
```python
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

# The scaled, shuffled data now has a mean of zero.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
```
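To make the Fit()/Transform() distinction above concrete, here is a minimal standalone StandardScaler example (not the lab's code): the scaler is fit only on the training data, and the same learned μ and σ are then applied to the test data.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X_tr = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
X_te = np.array([[1.5, 250.0]])

scaler = StandardScaler()
scaler.fit(X_tr)                    # learns mu and sigma from the training set only
print(scaler.mean_, scaler.scale_)

X_tr_s = scaler.transform(X_tr)     # apply the learned parameters
X_te_s = scaler.transform(X_te)     # same parameters reused on the test set
print(X_tr_s.mean(axis=0))          # ~0 per column for the training data
```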
```python
uid = 36
# form a set of user vectors. This is the same vector, transformed and repeated.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train), item_vecs, user_to_genre)
```
```python
# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s,
                                                                      scaler, scalerUser, scalerItem,
                                                                      scaledata=scaledata)
sorted_y = y_vecs[sorted_index]
```