** Some parts of this script may not be optimized as they are done in live tutorial, only **
apparts<-read.csv("https://chamroukhi.com/data/apparts_Paris.csv", header=TRUE, sep=",", dec=".")
apparts
## surfaces prices
## 1 28 130
## 2 50 280
## 3 55 268
## 4 110 500
## 5 60 320
## 6 48 250
## 7 90 378
## 8 35 250
## 9 86 350
## 10 65 300
## 11 32 155
## 12 52 245
## 13 40 200
## 14 70 325
## 15 28 85
## 16 30 78
## 17 105 375
## 18 52 200
## 19 80 270
## 20 20 85
surfaces <- apparts$surfaces
prices <- apparts$prices
summary(surfaces)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 34.25 52.00 56.80 72.50 110.00
summary(prices)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 78.0 188.8 259.0 252.2 321.2 500.0
plot(surfaces, prices, xlab= "surface in m^2",ylab = "price in K euros")
# Linear Regression
x <- surfaces
y <- prices
lr_fit = lm(y ~ x)
summary(lr_fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.469 -27.633 4.748 24.960 81.682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.6438 24.4450 1.376 0.186
## x 3.8478 0.3922 9.811 1.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.02 on 18 degrees of freedom
## Multiple R-squared: 0.8425, Adjusted R-squared: 0.8337
## F-statistic: 96.26 on 1 and 18 DF, p-value: 1.197e-08
y_hat = lr_fit$fitted.values
beta_0 = lr_fit$coefficients[1]
print(beta_0)
## (Intercept)
## 33.64382
beta_1 = lr_fit$coefficients[2]
print(beta_1)
## x
## 3.84782
plot(x, y, xlab= "surface in m^2",ylab = "price in K euros")
lines(x, y_hat, col="red")
#lines(x, beta_0 + beta_1*x , col="blue")
#legend("topright", legend=c("data", "fitted model"),col=c("black", "red"), lty=1:2, cex=0.8)
summary(lr_fit)$r.squared
## [1] 0.8424632
alpha = 0.05
confint(lr_fit, level = 1 - alpha)
## 2.5 % 97.5 %
## (Intercept) -17.713158 85.000790
## x 3.023864 4.671776
%{r, include=TRUE, echo=TRUE} %n = length(y) %qt(1 - alpha/2, n-2) # quantile of order alpha/2 of student t with n-2 df %
summary(lr_fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.64382 24.4449689 1.376308 1.856079e-01
## x 3.84782 0.3921877 9.811170 1.196624e-08
seqx <- seq(min(x),max(x),length=100)
#apparts <- data.frame(surfaces, prices)
apparts <- data.frame(x, y)
Conf_Int <- predict(lr_fit, data.frame(x = seqx), interval="confidence")[,c("lwr","upr")]
summary(Conf_Int)
## lwr upr
## Min. : 73.63 Min. :147.6
## 1st Qu.:172.97 1st Qu.:221.4
## Median :261.55 Median :306.0
## Mean :253.74 Mean :313.8
## 3rd Qu.:337.36 3rd Qu.:403.3
## Max. :408.23 Max. :505.6
plot(y~x, xlab= "surface in m^2",ylab = "price in K euros", cex=0.8) #,xlim = c(min(x),max(x)), ylim = c(min(y),max(y))
abline(lr_fit$coefficients[1],lr_fit$coefficients[2])
#matlines(seqx, cbind(Conf_Int, Pred_Int),lty=c(2,2,3,3), col=c("red","red","blue","blue"),lwd=c(2,2))
matlines(seqx, Conf_Int, lty=c(2,2), lwd=c(3,3), col=c("red","red"))
legend("bottomright",lty=c(1,3),lwd=c(1,2), c("Regression Line", paste(toString(1- alpha),"Confidence Interval")),col=c("black","red"),cex=0.8)
#lines(lr_fit$fitted.values)
#matlines(Pred_Int)
Pred_Int <- predict(lr_fit, data.frame(x = seqx), interval="prediction")[,c("lwr","upr")]
summary(Conf_Int)
## lwr upr
## Min. : 73.63 Min. :147.6
## 1st Qu.:172.97 1st Qu.:221.4
## Median :261.55 Median :306.0
## Mean :253.74 Mean :313.8
## 3rd Qu.:337.36 3rd Qu.:403.3
## Max. :408.23 Max. :505.6
plot(y~x, xlab= "surface in m^2",ylab = "price in K euros", cex=0.8)
abline(lr_fit$coefficients[1],lr_fit$coefficients[2])
matlines(seqx, cbind(Conf_Int, Pred_Int),lty=c(2,2,3,3), col=c("red","red","blue","blue"),lwd=c(2,2))
legend("bottomright",lty=c(2,3),lwd=c(2,1), c("Confidence Interval","Prediction Interval"),col=c("red","blue"),cex=0.8)
#lines(lr_fit$fitted.values)
#matlines(Pred_Int)
x0 <- 30
predict(lr_fit, data.frame(x = x0), interval="confidence")
## fit lwr upr
## 1 149.0784 118.5031 179.6537
predict(lr_fit, data.frame(x = x0), interval="prediction")
## fit lwr upr
## 1 149.0784 49.68257 248.4743
y_hat = lr_fit$fitted.values
residuals <- y - y_hat
#plot(x, residuals )
par(mfrow = c(1, 2))
plot(lr_fit$fitted.values, residuals, xlab = "fitted values", ylab = "residuals")
abline(0,0, col="black")
plot(lr_fit$fitted.values, y, xlab = "fitted values", ylab = "prices")
abline(0,1, col="black")
apparts<-read.csv("https://chamroukhi.com/data/apparts_Marseille.csv", header=TRUE, sep=",", dec=".")
apparts
## surfaces prices
## 1 62 190.8
## 2 68 245.0
## 3 67 200.0
## 4 76 350.0
## 5 90 274.0
## 6 67 242.0
## 7 90 420.0
## 8 60 195.0
## 9 81 220.0
## 10 90 265.0
## 11 70 182.0
## 12 63 208.0
## 13 80 223.0
## 14 84 252.0
## 15 73 343.0
## 16 70 199.0
## 17 80 230.0
## 18 78 245.0
## 19 63 222.0
## 20 66 185.0
surfaces <- apparts$surfaces
prices <- apparts$prices
summary(surfaces)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60.00 66.75 71.50 73.90 80.25 90.00
summary(prices)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 182.0 199.8 226.5 244.5 255.2 420.0
plot(surfaces, prices, xlab= "surface in m^2",ylab = "price in K euros")
# Linear Regression
x <- surfaces
y <- prices
lr_fit = lm(y ~ x)
summary(lr_fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.38 -32.70 -16.95 18.33 116.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.42 90.90 -0.269 0.79124
## x 3.64 1.22 2.984 0.00796 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.07 on 18 degrees of freedom
## Multiple R-squared: 0.3309, Adjusted R-squared: 0.2937
## F-statistic: 8.902 on 1 and 18 DF, p-value: 0.007965
y_hat = lr_fit$fitted.values
beta_0 = lr_fit$coefficients[1]
print(beta_0)
## (Intercept)
## -24.42063
beta_1 = lr_fit$coefficients[2]
print(beta_1)
## x
## 3.639521
plot(x, y, xlab= "surface in m^2",ylab = "price in K euros")
lines(x, y_hat, col="red")
#lines(x, beta_0 + beta_1*x , col="blue")
#legend("topright", legend=c("data", "fitted model"),col=c("black", "red"), lty=1:2, cex=0.8)
summary(lr_fit)$r.squared
## [1] 0.3308968
alpha = 0.05
confint(lr_fit, level = 1 - alpha)
## 2.5 % 97.5 %
## (Intercept) -215.3860 166.544792
## x 1.0767 6.202342
summary(lr_fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.420628 90.896005 -0.2686656 0.79124212
## x 3.639521 1.219855 2.9835680 0.00796485
seqx <- seq(min(x),max(x),length=100)
#apparts <- data.frame(surfaces, prices)
apparts <- data.frame(x, y)
Conf_Int <- predict(lr_fit, data.frame(x = seqx), interval="confidence")[,c("lwr","upr")]
summary(Conf_Int)
## lwr upr
## Min. :150.7 Min. :237.2
## 1st Qu.:191.8 1st Qu.:250.7
## Median :223.9 Median :273.2
## Mean :215.9 Mean :281.1
## 3rd Qu.:242.9 3rd Qu.:308.8
## Max. :255.2 Max. :351.1
plot(y~x, xlab= "surface in m^2",ylab = "price in K euros", cex=0.8) #,xlim = c(min(x),max(x)), ylim = c(min(y),max(y))
abline(lr_fit$coefficients[1],lr_fit$coefficients[2])
#matlines(seqx, cbind(Conf_Int, Pred_Int),lty=c(2,2,3,3), col=c("red","red","blue","blue"),lwd=c(2,2))
matlines(seqx, Conf_Int, lty=c(2,2), lwd=c(3,3), col=c("red","red"))
legend("bottomright",lty=c(1,3),lwd=c(1,2), c("Regression Line", paste(toString(1- alpha),"Confidence Interval")),col=c("black","red"),cex=0.8)
#lines(lr_fit$fitted.values)
#matlines(Pred_Int)
Pred_Int <- predict(lr_fit, data.frame(x = seqx), interval="prediction")[,c("lwr","upr")]
summary(Conf_Int)
## lwr upr
## Min. :150.7 Min. :237.2
## 1st Qu.:191.8 1st Qu.:250.7
## Median :223.9 Median :273.2
## Mean :215.9 Mean :281.1
## 3rd Qu.:242.9 3rd Qu.:308.8
## Max. :255.2 Max. :351.1
plot(y~x, xlab= "surface in m^2",ylab = "price in K euros", cex=0.8)
abline(lr_fit$coefficients[1],lr_fit$coefficients[2])
matlines(seqx, cbind(Conf_Int, Pred_Int),lty=c(2,2,3,3), col=c("red","red","blue","blue"),lwd=c(2,2))
legend("bottomright",lty=c(2,3),lwd=c(2,1), c("Confidence Interval","Prediction Interval"),col=c("red","blue"),cex=0.8)
x0 <- 50
predict(lr_fit, data.frame(x = x0), interval="confidence")
## fit lwr upr
## 1 157.5554 91.60076 223.5101
predict(lr_fit, data.frame(x = x0), interval="prediction")
## fit lwr upr
## 1 157.5554 29.82253 285.2884
y_hat = lr_fit$fitted.values
residuals <- y - y_hat
par(mfrow = c(1, 2))
plot(lr_fit$fitted.values, residuals, xlab = "fitted values", ylab = "residuals")
abline(0,0, col="black")
plot(lr_fit$fitted.values, y, xlab = "fitted values", ylab = "prices")
abline(0,1, col="black")
y_new = y[y<=300]
x_new = x[y<=300]
lr_fit_new = lm(y_new ~ x_new)
y_hat_new = lr_fit_new$fitted.values
beta_0 = lr_fit_new$coefficients[1]
beta_1 = lr_fit_new$coefficients[2]
summary(lr_fit_new)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.655234 36.9729083 1.802813 0.091541417
## x_new 2.134513 0.5030252 4.243351 0.000708159
summary(lr_fit_new)$r.squared
## [1] 0.5455375
confint(lr_fit_new)
## 2.5 % 97.5 %
## (Intercept) -12.15065 145.461122
## x_new 1.06234 3.206685
plot(x_new, y_new, xlab= "surface in m^2",ylab = "price in K euros")
lines(x_new, y_hat_new, col="red")
x0 <- 50
predict(lr_fit_new, data.frame(x_new = x0), interval="confidence")
## fit lwr upr
## 1 173.3809 146.8096 199.9521
predict(lr_fit_new, data.frame(x_new = x0), interval="prediction")
## fit lwr upr
## 1 173.3809 123.6207 223.141
Confidence Interval
seqx <- seq(min(x_new),max(x_new),length=1000)
#apparts <- data.frame(surfaces, prices)
apparts <- data.frame(x_new, y_new)
Pred_Int <- predict(lr_fit_new, data.frame(x_new = seqx), interval="prediction")[,c("lwr","upr")]
Conf_Int <- predict(lr_fit_new, data.frame(x_new = seqx), interval="confidence")[,c("lwr","upr")]
summary(Conf_Int)
## lwr upr
## Min. :177.6 Min. :211.9
## 1st Qu.:199.0 1st Qu.:222.5
## Median :216.3 Median :237.2
## Mean :213.1 Mean :240.4
## 3rd Qu.:228.2 3rd Qu.:257.3
## Max. :237.8 Max. :279.8
plot(y_new~x_new, xlab= "surface in m^2",ylab = "price in K euros", cex=0.8) #,xlim = c(min(x_new),max(x_new)), ylim = c(min(y_new),max(y_new))
abline(lr_fit_new$coefficients[1],lr_fit_new$coefficients[2])
matlines(seqx, cbind(Conf_Int, Pred_Int),lty=c(2,2,3,3), col=c("red","red","blue","blue"),lwd=c(2,2))
legend("bottomright",lty=c(2,3),lwd=c(2,1), c("Confidence Interval","Prediction Interval"),col=c("red","blue"),cex=0.8)
#lines(lr_fit_new$fitted.values)
#matlines(Pred_Int)
y_hat = lr_fit_new$fitted.values
residuals <- y_new - y_hat
par(mfrow = c(1, 2))
plot(lr_fit_new$fitted.values, residuals, xlab = "fitted values", ylab = "residuals")
abline(0,0, col="black")
plot(lr_fit_new$fitted.values, y_new, xlab = "fitted values", ylab = "prices")
abline(0,1, col="black")
X = matrix(c(surfaces, prices),ncol = 2)
K=2
sol = kmeans(X, K)
plot(X[,1],X[,2], col = factor(sol$cluster), xlab= "surface in m^2",ylab = "price in K euros", main = "kmeans clustering")