前言

吴恩达有一套课程Deep Learning，对机器学习的基础理论做了非常好的介绍，上课视频质量非常好，而且练习题都设计得很有水平，并提供了Matlab的答案。本文针对这些练习题，提供了一份R语言版的答案。

练习2-线性回归

题目与数据请点这里

# read data: x is age, y is height
x <- read.csv("ex2x.dat", header=F)
y <- read.csv("ex2y.dat", header=F)
colnames(x) <- c("age")
colnames(y) <- c("height")

# scatter plot
input <- data.frame(age=x$age,height=y$height)
library(ggplot2)
p <- ggplot(aes(x=age, y=height), data=input)
p + geom_point(size=3,shape=3) + theme_bw() + xlab("Age in years") + ylab("Height in meters")

# gradient descend algorithm implementation
x <- cbind(c(rep(1, nrow(x))), x$age)
y <- as.matrix(y)
theta <- c(rep(0,ncol(x)))
alpha <- 0.07
MAX_ITR <- 300
for(i in 1:MAX_ITR){
    grad <- 1/nrow(x) * t(x) %*% (x %*% theta - y) 
    theta <- theta - alpha * grad
}
print(theta)

# add fitted line to plot
p + geom_point(size=3,shape=4) + geom_abline(intercept=theta[1], slope=theta[2], size=1) + geom_smooth(method="lm", size=1) + theme_bw()+ xlab("Age in years") + ylab("Height in meters")

拟合效果如下图，蓝色线为最小二乘法得到的直线，黑色为最大迭代次数MAX_ITR设为300时的直线。当MAX_ITR设为1500时，黑色直线将于蓝色直线非常接近。

练习3-多元线性回归

题目与数据请点这里

# read raw data
x <- read.table("ex3x.dat", header=F)
y <- read.table("ex3y.dat", header=F)
colnames(x) <- c("area", "room_num")
colnames(y) <- c("price")

# scatter plot
# input <- data.frame(area=x$area, room_num=x$room_num, price=y$price)
# plot(input)

# normalization
x <- cbind(rep(1, nrow(x)), x$area, x$room_num)
x[, 2:3] <- scale(x[, 2:3])
y <- as.matrix(y)

# learning rate experiment
library(ggplot2)
iter_num <- 50
alpha <- c(0.01, 0.03, 0.1, 0.3, 1, 1.3)
plot_data <- NULL
plot_color <- c('black', 'red', 'green', 'blue', 'orange', 'purple')
for(j in 1:length(alpha)){ 
  theta <- c(rep(0, ncol(x)))  
  Jtheta <- c(rep(0, iter_num))  
  for(i in 1:iter_num){    
    Jtheta[i] <- (1/(2*nrow(x))) * t(x %*% theta - y) %*% (x %*% theta - y)
    grad <- 1/nrow(x) * t(x) %*% (x %*% theta -y)   
    theta <- theta -alpha[j] * grad   
  }

  if(is.null(plot_data)){
    plot_data <- data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
  else {
    plot_data <- rbind(plot_data,data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
}
p <- ggplot(plot_data)
p <- p + geom_line(aes(x=iter, y=cost, colour=as.factor(rate), group=rate))
p + theme_bw() + scale_colour_hue(name="learning rate")

# We find best convergency occurs when learning rate is 1
alpha <- 1.0
iter_num <- 100
theta <- c(rep(0, ncol(x)))  
for(i in 1:iter_num){    
  grad <- 1/nrow(x) * t(x) %*% (x %*% theta -y)   
  theta <- theta -alpha * grad   
}

# prediction for price of an apartment with area=1650 and room_num=3
x <- read.table("ex3x.dat", header=F)
y <- read.table("ex3y.dat", header=F)
colnames(x) <- c("area", "room_num")
colnames(y) <- c("price")
price_pred <- t(theta) %*% c(1, (1650-mean(x$area))/sd(x$area), (3-mean(x$room_num))/sd(x$room_num))
print(price_pred)

其中学习率选择的曲线如下，可以看到设定为1时收敛最快。

练习4-逻辑回归

题目和数据请点这里

# read raw data
x <- read.table("ex4Data/ex4x.dat", header=F)
y <- read.table("ex4Data/ex4y.dat", header=F)
colnames(x) <- c("t1_mark", "t2_mark")
colnames(y) <- c("admit")

# plot the data
input <- data.frame(t1_mark=x$t1_mark, t2_mark=x$t2_mark, admit=y$admit)
library(ggplot2)
p <- ggplot(data=input, aes(x=t1_mark, y= t2_mark, colour=as.factor(admit)))

p + geom_point() + theme_bw() + scale_colour_hue(name="Admit result",breaks=c(0,1),labels=c("Not admitted","Admitted")) + xlab("Test 1 score") + ylab("Test 2 score")

# Newton-Raphson's method implementation
sigmoid_value <- function(z){
  return (1/(1+exp(-z)))
}
x <- cbind(rep(1,nrow(x)), x$t1_mark, x$t2_mark)
y <- as.matrix(y) 
iter_num <- 7
theta <- rep(0, ncol(x))
Jtheta <- rep(0, iter_num)
for(i in 1:iter_num){
  z <- x %*% theta
  h <- sigmoid_value(z)
  # get gradient, Hession and cost function
  grad <- 1/nrow(x) * t(x)%*%(h-y)
  H <- (1/nrow(x)) * t(x) %*% diag(as.vector(h)) %*% diag(1-as.vector(h)) %*% x
  Jtheta[i] <- 1/nrow(x) * sum(-y*log(h)-(1-y)*log(1-h)) 
  theta <- theta - solve(H)%*%grad
}
curve <- data.frame(iter=c(1:iter_num), cost=Jtheta)
ggplot(data=curve,aes(x=iter,y=cost)) + geom_line() + geom_point() + theme_bw()
theta

# Predict the not admit probability of a student with t1_score=20 and t2_score=80
prob <- 1 - sigmoid_value(c(1,20,80)%*%theta)

# Plot the decision boundary line
plot_x <- c(min(x[,2])-2, max(x[,2])+2)
plot_y <- (-1/theta[3])*(theta[2]*plot_x+theta[1])
p + geom_point() + theme_bw() + geom_segment(x=plot_x[1],xend=plot_x[2],y=plot_y[1],yend=plot_y[2], colour="black") + scale_colour_hue(name="Admit result",breaks=c(0,1),labels=c("Not admitted","Admitted")) + xlab("Test 1 score") + ylab("Test 2 score")

# 此处尝试用梯度下降法求解逻辑回归，发现很久不收敛，不知道是不是写错了
# Gradient dscend algorithm implementation
iter_num <- 10000
#alpha <- c(0.01, 0.03, 0.1, 0.3, 1, 1.3)
alpha <- c(0.00001,0.00003,0.0001,0.0003,0.001,0.0013,0.0014)
plot_data <- NULL
for(j in 1:length(alpha)){
  Jtheta <- rep(0, iter_num)
  theta <- rep(0, ncol(x))
  for(i in 1:iter_num){
    z <- x%*%theta
    h <- sigmoid_value(z)
    Jtheta[i] <- 1/nrow(x) * sum(-y*log(h)-(1-y)*log(1-h)) 
    grad <- 1/nrow(x) * t(x)%*%(h-y)
    theta <- theta - alpha[j] * grad
  }

  if(is.null(plot_data)){
    plot_data <- data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
  else {
    plot_data <- rbind(plot_data,data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j]))
  }  
}
p <- ggplot(plot_data)
p <- p + geom_line(aes(x=iter, y=cost, colour=as.factor(rate), group=rate))
p + theme_bw() + scale_colour_hue(name="learning rate")

下图显示了牛顿法的收敛过程，4步就cost收敛到较低水平了。

下图是使用逻辑回归分类的效果和决策边界：

下图是梯度下降的收敛过程，久久不收敛，估计是代码哪里写错了，暂时没搞明白。

练习5-正则化

题目和数据请点这里

该题目分为线性回归与逻辑回归两个部分。

线性回归

# read raw data
x <- read.table("ex5Linx.dat",header=F)
y <- read.table("ex5Liny.dat",header=F)

# plot the data
library(ggplot2)
input <- cbind(x,y)
colnames(input) <- c("x","y")
p <- ggplot() + theme_bw()
p <- p + geom_point(data=input,aes(x=x,y=y),colour=rgb(213/255,26/255,33/255)) 
p

#normal equation method
f1 <- function(x, theta){
  return (as.vector((matrix(c(rep(1,length(x)),x,x^2,x^3,x^4,x^5),nrow=length(x), ncol=6, byrow=FALSE) %*% theta)))
}

lamda <- c(0,1,10)
x <- as.matrix(cbind(rep(1,nrow(x)),x,x^2,x^3,x^4,x^5))
y <- as.matrix(y)
rm <- diag(c(0, rep(1,ncol(x)-1)))

curve <- NULL
for(i in 1:length(lamda)){
  theta <- solve((t(x)%*%x+lamda[i]*rm))%*%t(x)%*%y
  xx <- seq(-1,1,0.001)
  yy <- f1(xx, theta)
  if(is.null(curve)){
    curve <- data.frame(x=xx,y=yy,lamda=rep(lamda[i],length(xx)))
  }
  else {
    curve <- rbind(curve, data.frame(x=xx,y=yy,lamda=rep(lamda[i],length(xx))))
  }
  print(theta)
}

p <- p + geom_line(data=curve, aes(x=x,y=y,group=lamda,colour=as.factor(lamda)))

p

下图是拟合效果。可以看到当正则项的惩罚系数越大时，拟合效果越差；当惩罚系数过小时，又存在过拟合问题。

逻辑回归

x <- read.table("ex5Logx.dat",sep=",", header=F)
y <- read.table("ex5Logy.dat", header=F)
colnames(x) <- c("u","v")
colnames(y) <- c("type")

#plot the data
input <- data.frame(u=x$u,v=x$v,type=as.factor(y$type))
p <- ggplot() + theme_bw()
p <- p + geom_point(data=input,aes(x=u,y=v,shape=type, size=3))
#p <- p + scale_colour_manual(values=c('blue','red')) + labs(colour='Type')
p <- p + scale_shape_manual(values=c('o','+')) + labs(shape='Type')
p <- p + scale_size(guide="none")
p

#utility function
map_feature <- function(feat1, feat2){
  degree <- 6
  out = rep(1,length(feat1))
  for(i in 1:degree){
    for(j in 0:i){
      out <- cbind(out,(feat1^(i-j)*feat2^j))
    }
  }
  return (out)
}

f1 <- function(x, theta){
  return (as.vector(map_feature(x$u,x$v) %*% theta))
}

#sigmoid function
sigmoid_value <- function(z){
  return (1/(1+exp(-z)))
}

#Newton-Raphson's Method

lambda <- c(0,1,10)
out <- map_feature(x$u,x$v)
y <- as.matrix(y)
curve <- NULL
con_store <- NULL
u <- seq(-1,1.5,0.05)
v <- seq(-1,1.5,0.05)
for(i in 1:length(lambda)){
  iter_num <- 15
  theta <- rep(0, ncol(out))
  Jtheta <- rep(0, iter_num)
  con <- matrix(0, nrow=length(u)*length(v), ncol=4)
  for(j in 1:iter_num){
    h <- sigmoid_value(out%*%theta)
    #cost function, judge if converged
    Jtheta[j] <- 1/nrow(out)*sum(-y*log(h)-(1-y)*log(1-h)) + lambda[i]/(2*nrow(out))*sum(theta[-1]^2)
    #gradient
    grad <- 1/nrow(out)*t(out)%*%(h-y) + lambda[i]/nrow(out)*c(0, theta[-1])
    #hession
    H <- 1/nrow(out)*t(out)%*%diag(as.vector(h))%*%diag(1-as.vector(h))%*%out + lambda[i]/nrow(out)*diag(c(0, rep(1,ncol(out)-1)))
    theta <- theta - solve(H)%*%grad
  }
  if(is.null(curve)){
    curve <- data.frame(iter=c(1:iter_num), cost=Jtheta, lambda=lambda[i])
  }
  else {
    curve <- rbind(curve,data.frame(iter=c(1:iter_num), cost=Jtheta, lambda=lambda[i]))
  }

  for(k in 1:length(u)){
    for(l in 1:length(v)){
      z <- map_feature(u[k],v[l])%*%theta
      con[(k-1)*length(v)+l, 1] = u[k]
      con[(k-1)*length(v)+l, 2] = v[l]
      con[(k-1)*length(v)+l, 3] = z
      con[(k-1)*length(v)+l, 4] = lambda[i]
    }
  }
  if(is.null(con_store)){
    con_store <- as.data.frame(con)
  }
  else {
    con_store <- rbind(con_store,con)
  }
}

#plot convergency draft
ggplot(data=curve) + theme_bw() + geom_point(aes(x=iter, y=cost)) + geom_line(aes(x=iter,y=cost, group=lambda, colour=as.factor(lambda))) + scale_color_hue(name="lambda")
colnames(con_store) <- c("x","y","z","lambda")
p <- p + stat_contour(data=con_store,aes(x=x,y=y,z=z,group=as.factor(lambda), colour=as.factor(lambda), size=2),breaks=c(0)) + scale_color_hue(name="Lambda")
p

下图是不同lambda惩罚系数的cost收敛效果。

下图是逻辑回归在不同惩罚系数下的分类效果，lambda=0时存在过拟合。

关于作者：丹追兵：数据分析师一枚，编程语言python和R，使用Spark、Hadoop、Storm、ODPS。本文出自丹追兵的pytrafficR专栏，转载请注明作者与出处：https://segmentfault.com/blog...