# # Simple linear regression # # The "Advertising" data is not contained in the ISLR package, but can be # downloaded from the web: Advertising = read.csv("Advertising.csv",row.names=1) # some numeric and graphical summaries summary(Advertising) # Fit the simple linear regression model with predictor TV and response Sales mylm = lm(Sales~TV,data=Advertising) attach(Advertising) plot(TV,Sales, type='b',pch=19) abline(mylm) # abline draws a line, when given an intercept and slope, or a linear model summary(mylm) # displays basic summaries of the lm # Note that the basic summaries above include much of the information That we # need, including coefficient estimates, standard errors, t-stat and p-values # for testing a null hypothesis of a zero value, and residual standard error, # R^2 and a F-statistic (see later). confint(mylm) # makes confidence intervals for the coefficients of the lm # show some diagnostics for the fitted model par(mfrow=c(2,2)) plot(mylm) # # Multiple linear regression # mylm2 = lm(Sales~TV+Radio+Newspaper,data=Advertising) summary(mylm2) # X <- read.table("sim.txt", header = TRUE) names(X) attach(X) out <- lm(y ~ x) par(mar = c(5, 4, 1, 1) + 0.1) plot(x, y) abline(out) out.poly <- lm(y ~ poly(x, 9)) summary(out.poly) par(mfrow=c(1,2)) plot(x, y) curve(predict(out.poly, data.frame(x = x)), add = TRUE) abline(out, lty = 2) xx <- seq(min(x), max(x), length = 1000) yy <- predict(out.poly, data.frame(x = xx)) plot(x, y, ylim = range(yy)) curve(predict(out.poly, data.frame(x = x)), add = TRUE) abline(out, lty = 2) # # Model selection for sim.txt dat # n <- length(y) deg <- seq(1, 8, 1) cp <- length(deg) aic <- length(deg) bic <- length(deg) out.big <- lm(y ~ poly(x, 2)) sigsqhat.big <- summary(out.big)$sigma^2 for (i in seq(along = deg)) { k <- deg[i] out <- lm(y ~ poly(x, k)) aic[i] <- AIC(out) bic[i] <- AIC(out, k = log(n)) cp[i] <- sum(out$residuals^2)/sigsqhat.big + 2 * out$rank - n } foo <- cbind(deg, cp, aic, bic) dimnames(foo) <- list(NULL, c("degree", "Cp", "AIC", "BIC")) foo par(mfrow=c(1,1)) plot(x, y) out.poly.CPorBIC<- lm(y ~ poly(x, 3)) out.poly.AIC <- lm(y ~ poly(x, 6)) out.poly.big <- lm(y ~ poly(x, 8)) out.linear <- lm(y ~ x) abline(out.linear, col=4) curve(predict(out.poly.AIC, data.frame(x = x)), add = TRUE, col=1) curve(predict(out.poly.big, data.frame(x = x)), add = TRUE, col=2) curve(predict(out.poly.CPorBIC, data.frame(x = x)), add = TRUE, col=3) legend('topleft',c('AIC', 'CpBIC','Poly(8)','linear'),text.col=c(1,2,3,4)) # # Variable selection using subset selection for Advertising data # mylm = lm(Sales~TV+Radio+Newspaper,data=Advertising) summary(mylm) # Stepwise variable selection step(mylm) # default is backward when just a model is given. # To do forward stepwise, first fit a "null" model with just an intercept nulllm = lm(Sales~1,data=Advertising) mylm2 = step(nulllm,scope=list(lower=nulllm,upper=mylm),direction='forward') # the above command is different in several ways from forward stepwise. #1) it specifies a lower and upper bound of models to consider, #2) the result is assigned to "mylm2", saving the resultant linear model as a new model. summary(mylm2) # we can manipulate "mylm2" just like a model we fit via "lm" (above) #Using AIC or BIC step(nulllm,scope=list(lower=nulllm,upper=mylm),k=2) #AIC n=dim(Advertising)[1] step(nulllm,scope=list(lower=nulllm,upper=mylm),k=log(n),trace=0)#BIC # # Using leaps package # library(leaps) mylm4=regsubsets(Sales~TV+Radio+Newspaper,data=Advertising) par(mfrow=c(1,3)) plot(mylm4, scale="adjr2") plot(mylm4, scale="bic") plot(mylm4, scale="Cp") ######Case studies for Credit card data ######## credit = read.csv("Credit.csv",row.names=1) head(credit) names(credit) [1] "Income" "Limit" "Rating" "Cards" "Age" "Education" [7] "Gender" "Student" "Married" "Ethnicity" "Balance" dim(credit) table(credit$Cards) table(credit$Gender) table(credit$Student) table(credit$Married) table(credit$Ethnicity) summary(credit[,c(1:5,11)]) summary(lm(Balance~Gender,data=credit)) summary(lm(Balance~Ethnicity,data=credit)) summary(lm(Balance~Gender+Ethnicity+Income,data=credit)) summary(lm(Balance~Income*Gender,data=credit)) full=lm(Balance~.,data=credit) null=lm(Balance~1,data=credit) #run a full stepwise search using BIC first n=dim(credit)[1] final.bic=step(null,scope=list(lower=null,upper=full),k=log(n),trace=0)#BIC summary(final.bic) final.aic=step(null,scope=list(lower=null,upper=full),k=2,trace=0)#BIC summary(final.aic) lmcredit=regsubsets(Balance~.,data=credit) par(mfrow=c(1,3)) plot(lmcredit, scale="adjr2") plot(lmcredit, scale="bic") plot(lmcredit, scale="Cp") # # High-dimensional data analysis # install.packages('parcor') library(parcor) install.packages('ncvreg') library(ncvreg) install.packages('lassoshooting') library(lassoshooting) #Fit coefficients paths for MCP- or SCAD-penalized regression models over a grid of values for #the regularization parameter lambda. Fits linear and logistic regression models, with option for an #additional L2 penalty. ncvreg(X, y, family=c("gaussian", "binomial", "poisson"), penalty=c("MCP", "SCAD", "lasso"), gamma=switch(penalty, SCAD=3.7, 3), alpha=1, lambda.min=ifelse(n>p,.001,.05), nlambda=100, lambda, eps=.001, max.iter=1000, convex=TRUE, dfmax=p+1, penalty.factor=rep(1, ncol(X)), warn=TRUE, returnX=FALSE, ...) #For lasso linear regression ncvreg(X, y, family="gaussian", penalty="lasso") # or default one # Prostate data, pn #load("nci.rda") load("nci_MAD.rda") names(nci_MAD) [1] "x" "y" dim(nci_MAD$x) # [1] 59 500 length(nci_MAD$y) fit_nci <- ncvreg(nci_MAD$x,nci_MAD$y) plot(fit_nci) ## Lasso solution path cvfit_nci <- cv.ncvreg(nci_MAD$x,nci_MAD$y) plot(cvfit_nci) ## Lasso cross validation curve fitall_nci <- cvfit_nci$fit beta_nci <- fitall_nci$beta[,cvfit_nci$min] beta_nci[!beta_nci==0]