<- read.table('data/Pantanillos.txt', header = T, sep = '') data
7 Validación de Modelos
7.1 Práctico
7.1.1 Explorar datos
Lectura de Datos
XUTM | YUTM | LAT | LONG | BIOMAS | TM1 | TM2 | TM3 | TM4 | TM5 | TM7 | BRIGHT | GREEN | MOIST | NDVI | NDVIC | PEND | ALT | H |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
744797.3 | 6071183 | -35.473 | -72.302 | 164.649 | 43 | 29 | 23 | 57 | 26 | 15 | 83.854 | -0.004 | -4.548 | 0.425 | 0.321 | 6.546 | 472.423 | 17.106 |
745007.3 | 6071183 | -35.473 | -72.300 | 88.663 | 48 | 36 | 37 | 66 | 53 | 32 | 109.035 | -9.390 | -29.607 | 0.282 | 0.141 | 17.793 | 491.138 | 2.838 |
744587.3 | 6071243 | -35.472 | -72.304 | 127.707 | 42 | 31 | 22 | 54 | 29 | 17 | 82.817 | -2.611 | -8.038 | 0.421 | 0.306 | 10.933 | 466.259 | 9.288 |
744797.3 | 6071363 | -35.471 | -72.302 | 203.209 | 43 | 29 | 23 | 62 | 28 | 15 | 87.794 | 3.431 | -5.745 | 0.459 | 0.338 | 15.306 | 497.456 | 13.273 |
745007.3 | 6071363 | -35.471 | -72.300 | 101.979 | 43 | 33 | 24 | 84 | 40 | 20 | 108.640 | 15.278 | -15.202 | 0.556 | 0.347 | 28.135 | 471.417 | 12.141 |
744587.3 | 6071423 | -35.471 | -72.304 | 185.372 | 44 | 29 | 22 | 53 | 27 | 16 | 81.422 | -2.956 | -5.942 | 0.413 | 0.308 | 18.362 | 484.353 | 9.874 |
Filtrar los datos
<- data[,-c(1:4)] data2
BIOMAS | TM1 | TM2 | TM3 | TM4 | TM5 | TM7 | BRIGHT | GREEN | MOIST | NDVI | NDVIC | PEND | ALT | H |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
164.649 | 43 | 29 | 23 | 57 | 26 | 15 | 83.854 | -0.004 | -4.548 | 0.425 | 0.321 | 6.546 | 472.423 | 17.106 |
88.663 | 48 | 36 | 37 | 66 | 53 | 32 | 109.035 | -9.390 | -29.607 | 0.282 | 0.141 | 17.793 | 491.138 | 2.838 |
127.707 | 42 | 31 | 22 | 54 | 29 | 17 | 82.817 | -2.611 | -8.038 | 0.421 | 0.306 | 10.933 | 466.259 | 9.288 |
203.209 | 43 | 29 | 23 | 62 | 28 | 15 | 87.794 | 3.431 | -5.745 | 0.459 | 0.338 | 15.306 | 497.456 | 13.273 |
101.979 | 43 | 33 | 24 | 84 | 40 | 20 | 108.640 | 15.278 | -15.202 | 0.556 | 0.347 | 28.135 | 471.417 | 12.141 |
185.372 | 44 | 29 | 22 | 53 | 27 | 16 | 81.422 | -2.956 | -5.942 | 0.413 | 0.308 | 18.362 | 484.353 | 9.874 |
Histograma de variable respuesta
# histograma de variable respuesta
hist(data$BIOMAS)
7.2 Definición de Funciones
Función para scatterplot
# funcion para scatterplot
<- function(obs, pred, main=''){
plot_prediction <- cbind(obs, pred)
x plot(x, xlab = 'Biomasa observada [kg m-2]', ylab = 'Biomasa predicha [kg m-2]', pch=16, las=1,
xlim = c(min(x), max(x)), ylim = c(min(x), max(x)), main=main, col=rgb(0,0,0,0.2), cex=1.5)
abline(0,1, lty=2)
<- lm(pred ~ obs-1)
lm abline(lm)
legend('topleft', legend=c('Linea 1:1', 'Linea ajustada'), lty=c(1,2), bty = 'n')
# agregar metricas de ajuste de modelo
<- round( (cor(pred, obs)^2), 2) # solo dos decimales con funcion round
r2 <- round((sqrt(mean((obs - pred)^2))/(max(obs) - min(obs))) * 100, 2) # RMSE normalizado
NRMSE = round( (1-coef(lm))*-1, 2)
bias mtext(bquote(paste(r^2 == .(r2), ",", " %RMSE" == .(NRMSE), "%", ",", " bias" ==
side = 3, line = 0.5, adj = 0, font = 2)
.(bias))),
}
Visualización de Residuos
<- function(res, pred, main='Modelo'){
plot_residuos par(mfrow=c(1,2))
plot(pred, res, xlab = 'Biomasa predicha [kg m-2]', ylab = 'Residuos [kg m-2]', pch=16, las=1,
main=main, col=rgb(0,0,0,0.2), cex=1.5)
abline(h=0, lty=2)
qqnorm(res)
qqline(res, lty=2)
}
7.3 Analisis
7.3.1 Análisis de Significancia de Corrección
<- cor(data2)
corr = cor.mtest(data2, conf.level = 0.95) # matriz con valores de p
testRes par(mfrow=c(1,1))
corrplot(corr, p.mat = testRes$p, type = 'lower', diag = FALSE) # X a las no sign. con p > 0.05
cor(data2) # mejor variable segun cor es one_mean
BIOMAS TM1 TM2 TM3 TM4 TM5
BIOMAS 1.00000000 -0.31361610 -0.3657966 -0.27179254 -0.46393079 -0.3594047
TM1 -0.31361610 1.00000000 0.9620088 0.95921101 0.09765849 0.9063002
TM2 -0.36579661 0.96200881 1.0000000 0.96123400 0.22164639 0.9364134
TM3 -0.27179254 0.95921101 0.9612340 1.00000000 0.04443123 0.9426152
TM4 -0.46393079 0.09765849 0.2216464 0.04443123 1.00000000 0.1900475
TM5 -0.35940473 0.90630018 0.9364134 0.94261521 0.19004752 1.0000000
TM7 -0.29321630 0.92326124 0.9328171 0.96913153 0.04712320 0.9762178
BRIGHT -0.46834630 0.87680231 0.9366557 0.87721991 0.50462364 0.9265610
GREEN 0.06247448 -0.84581862 -0.7924929 -0.89542924 0.39613113 -0.7977948
MOIST 0.32239600 -0.89367594 -0.9159402 -0.94344351 -0.10547874 -0.9939164
NDVI 0.11752383 -0.87593057 -0.8298394 -0.93068575 0.31098157 -0.8262571
NDVIC 0.27767911 -0.91453558 -0.8983385 -0.95949090 0.08793446 -0.9193543
PEND 0.10787286 -0.29088550 -0.2667467 -0.30306962 0.05054119 -0.1913806
ALT 0.06590560 -0.34638456 -0.2674291 -0.30541545 0.01988928 -0.2707978
H 0.60544860 -0.47942172 -0.4819355 -0.43697984 -0.31427651 -0.4823196
TM7 BRIGHT GREEN MOIST NDVI NDVIC
BIOMAS -0.2932163 -0.4683463 0.06247448 0.3223960 0.1175238 0.27767911
TM1 0.9232612 0.8768023 -0.84581862 -0.8936759 -0.8759306 -0.91453558
TM2 0.9328171 0.9366557 -0.79249292 -0.9159402 -0.8298394 -0.89833845
TM3 0.9691315 0.8772199 -0.89542924 -0.9434435 -0.9306857 -0.95949090
TM4 0.0471232 0.5046236 0.39613113 -0.1054787 0.3109816 0.08793446
TM5 0.9762178 0.9265610 -0.79779485 -0.9939164 -0.8262571 -0.91935429
TM7 1.0000000 0.8752779 -0.88290983 -0.9885743 -0.8979835 -0.94375520
BRIGHT 0.8752779 1.0000000 -0.59146405 -0.8891844 -0.6518289 -0.79086476
GREEN -0.8829098 -0.5914641 1.00000000 0.8369327 0.9869514 0.92129745
MOIST -0.9885743 -0.8891844 0.83693272 1.0000000 0.8556265 0.92978569
NDVI -0.8979835 -0.6518289 0.98695136 0.8556265 1.0000000 0.95917534
NDVIC -0.9437552 -0.7908648 0.92129745 0.9297857 0.9591753 1.00000000
PEND -0.2282007 -0.2052176 0.27841863 0.1905210 0.3166394 0.28910470
ALT -0.2832766 -0.2524166 0.28856127 0.2690036 0.3035372 0.30633676
H -0.4304685 -0.5347813 0.27232831 0.4517838 0.3362205 0.46368280
PEND ALT H
BIOMAS 0.10787286 0.06590560 0.60544860
TM1 -0.29088550 -0.34638456 -0.47942172
TM2 -0.26674670 -0.26742907 -0.48193548
TM3 -0.30306962 -0.30541545 -0.43697984
TM4 0.05054119 0.01988928 -0.31427651
TM5 -0.19138064 -0.27079776 -0.48231958
TM7 -0.22820067 -0.28327655 -0.43046847
BRIGHT -0.20521762 -0.25241661 -0.53478130
GREEN 0.27841863 0.28856127 0.27232831
MOIST 0.19052104 0.26900363 0.45178385
NDVI 0.31663942 0.30353719 0.33622053
NDVIC 0.28910470 0.30633676 0.46368280
PEND 1.00000000 0.19853031 0.38709465
ALT 0.19853031 1.00000000 0.07529796
H 0.38709465 0.07529796 1.00000000
#corr
7.3.2 Validación Cruzada
# separar en set de validacion y entrenamiento: Validacion cruzada simple!
set.seed(1234)
<- createDataPartition(data2$BIOMAS, p = 0.6, list = F)
idx length(idx)
[1] 56
%>% head() idx
Resample1
[1,] 1
[2,] 5
[3,] 8
[4,] 9
[5,] 11
[6,] 12
# 91*0.6 --> 54.6
# particionar los datos usando los datos generados
<- data2[idx, ] # 60% de los datos
entrenar <- data2[-idx, ] # 40% de los datos validar
7.4 Regresiones
7.4.1 Regresiones Simples
<- lm(BIOMAS ~ H, data = entrenar)
lm1 summary(lm1)
Call:
lm(formula = BIOMAS ~ H, data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-161.91 -31.59 -12.85 18.43 206.26
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.623 11.265 2.896 0.00545 **
H 6.921 1.430 4.840 1.12e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 56.64 on 54 degrees of freedom
Multiple R-squared: 0.3026, Adjusted R-squared: 0.2897
F-statistic: 23.43 on 1 and 54 DF, p-value: 1.125e-05
Referecias para explicar los modelos en R: (“Explaining the Lm() Summary in R – Learn by Marketing” n.d.)
effect_plot(lm1, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
<- glm(BIOMAS ~ H, data = entrenar, family = Gamma(link = "log"))
lm2 summary(lm2)
Call:
glm(formula = BIOMAS ~ H, family = Gamma(link = "log"), data = entrenar)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8159 -0.8806 -0.1909 0.3185 2.1156
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.58884 0.17418 20.605 < 2e-16 ***
H 0.09849 0.02211 4.456 4.25e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.7669877)
Null deviance: 58.130 on 55 degrees of freedom
Residual deviance: 43.909 on 54 degrees of freedom
AIC: 580.9
Number of Fisher Scoring iterations: 7
effect_plot(lm2, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
7.4.2 Regresión Multiple
## Regression multiple con todas las variables -----------------------------
<- lm(BIOMAS ~., data = entrenar)
lm3 summary(lm3)
Call:
lm(formula = BIOMAS ~ ., data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-100.67 -31.30 -8.14 21.58 182.80
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.993e+02 5.815e+02 0.859 0.395
TM1 9.320e+00 2.906e+01 0.321 0.750
TM2 1.533e+01 2.822e+01 0.543 0.590
TM3 1.866e+01 3.699e+01 0.504 0.617
TM4 -4.582e+00 3.919e+01 -0.117 0.908
TM5 -1.410e+00 3.274e+01 -0.043 0.966
TM7 -3.049e+00 2.650e+01 -0.115 0.909
BRIGHT -1.498e+01 5.023e+01 -0.298 0.767
GREEN 2.388e+01 4.312e+01 0.554 0.583
MOIST -1.232e+01 4.067e+01 -0.303 0.764
NDVI -1.204e+03 1.728e+03 -0.697 0.490
NDVIC 1.213e+03 8.425e+02 1.440 0.157
PEND -2.048e-01 1.008e+00 -0.203 0.840
ALT 1.759e-02 1.066e-01 0.165 0.870
H 3.802e+00 2.285e+00 1.664 0.104
Residual standard error: 57.21 on 41 degrees of freedom
Multiple R-squared: 0.4599, Adjusted R-squared: 0.2755
F-statistic: 2.494 on 14 and 41 DF, p-value: 0.01162
effect_plot(lm3, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
<- glm(BIOMAS ~., data = entrenar, family = Gamma(link = log))
lm4 summary(lm4)
Call:
glm(formula = BIOMAS ~ ., family = Gamma(link = log), data = entrenar)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4922 -0.7954 -0.1666 0.4182 1.5964
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.099546 9.159715 1.212 0.2325
TM1 0.437576 0.457743 0.956 0.3447
TM2 0.384137 0.444555 0.864 0.3926
TM3 0.448039 0.582720 0.769 0.4464
TM4 -0.035895 0.617363 -0.058 0.9539
TM5 0.027237 0.515759 0.053 0.9581
TM7 -0.018001 0.417409 -0.043 0.9658
BRIGHT -0.503779 0.791271 -0.637 0.5279
GREEN 0.627758 0.679168 0.924 0.3607
MOIST -0.271592 0.640674 -0.424 0.6738
NDVI -20.985287 27.223143 -0.771 0.4452
NDVIC 15.108460 13.270804 1.138 0.2615
PEND 0.001204 0.015875 0.076 0.9399
ALT -0.001040 0.001679 -0.619 0.5391
H 0.080295 0.035993 2.231 0.0312 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.8120839)
Null deviance: 58.13 on 55 degrees of freedom
Residual deviance: 36.52 on 41 degrees of freedom
AIC: 595.41
Number of Fisher Scoring iterations: 14
effect_plot(lm4, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
7.4.3 Regresión multiple con seleccion de variables stepwise
Utilizando train()
utilizanndo el método “lmStepAIC”, que va a buscar las conbinaciones de modelos lineales, simples, luego con dos, tres, etc., bajo el criterio AIC
. (Kuhn n.d.)
<- train(BIOMAS ~., data = entrenar, method = "lmStepAIC")
lm5 $finalModel lm5
El objeto caret lm5
guarda perfectamente cual es el mejor modelo, y se puede usar directamente para predecir, etc. Pero no se lleva bein caret con las funciones anova, AIC, así que vamos a usar las mejores variables y a hacer otro modelo
<- lm(BIOMAS ~ TM3 + BRIGHT + NDVIC + H, data = entrenar)
lm5 summary(lm5)
Call:
lm(formula = BIOMAS ~ TM3 + BRIGHT + NDVIC + H, data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-99.37 -30.51 -13.02 21.00 181.98
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 105.909 147.888 0.716 0.47717
TM3 10.183 4.256 2.393 0.02045 *
BRIGHT -4.611 1.340 -3.441 0.00116 **
NDVIC 487.008 305.084 1.596 0.11660
H 3.954 1.717 2.303 0.02538 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 52.46 on 51 degrees of freedom
Multiple R-squared: 0.435, Adjusted R-squared: 0.3907
F-statistic: 9.818 on 4 and 51 DF, p-value: 5.74e-06
effect_plot(lm5, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
Family: Gamma(link = log)
<- train(BIOMAS ~., data = entrenar, method = "glmStepAIC", family = Gamma(link = log))
lm6 $finalModel lm6
<- glm(BIOMAS ~ TM3 + BRIGHT + H, data = entrenar, family = Gamma(link = "log"))
lm6
effect_plot(lm6, pred = H, interval = TRUE, plot.points = TRUE,
jitter = 0.05)
7.5 Información de modelos
summary(lm1)
Call:
lm(formula = BIOMAS ~ H, data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-161.91 -31.59 -12.85 18.43 206.26
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.623 11.265 2.896 0.00545 **
H 6.921 1.430 4.840 1.12e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 56.64 on 54 degrees of freedom
Multiple R-squared: 0.3026, Adjusted R-squared: 0.2897
F-statistic: 23.43 on 1 and 54 DF, p-value: 1.125e-05
summary(lm2)
Call:
glm(formula = BIOMAS ~ H, family = Gamma(link = "log"), data = entrenar)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8159 -0.8806 -0.1909 0.3185 2.1156
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.58884 0.17418 20.605 < 2e-16 ***
H 0.09849 0.02211 4.456 4.25e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.7669877)
Null deviance: 58.130 on 55 degrees of freedom
Residual deviance: 43.909 on 54 degrees of freedom
AIC: 580.9
Number of Fisher Scoring iterations: 7
summary(lm3)
Call:
lm(formula = BIOMAS ~ ., data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-100.67 -31.30 -8.14 21.58 182.80
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.993e+02 5.815e+02 0.859 0.395
TM1 9.320e+00 2.906e+01 0.321 0.750
TM2 1.533e+01 2.822e+01 0.543 0.590
TM3 1.866e+01 3.699e+01 0.504 0.617
TM4 -4.582e+00 3.919e+01 -0.117 0.908
TM5 -1.410e+00 3.274e+01 -0.043 0.966
TM7 -3.049e+00 2.650e+01 -0.115 0.909
BRIGHT -1.498e+01 5.023e+01 -0.298 0.767
GREEN 2.388e+01 4.312e+01 0.554 0.583
MOIST -1.232e+01 4.067e+01 -0.303 0.764
NDVI -1.204e+03 1.728e+03 -0.697 0.490
NDVIC 1.213e+03 8.425e+02 1.440 0.157
PEND -2.048e-01 1.008e+00 -0.203 0.840
ALT 1.759e-02 1.066e-01 0.165 0.870
H 3.802e+00 2.285e+00 1.664 0.104
Residual standard error: 57.21 on 41 degrees of freedom
Multiple R-squared: 0.4599, Adjusted R-squared: 0.2755
F-statistic: 2.494 on 14 and 41 DF, p-value: 0.01162
summary(lm4)
Call:
glm(formula = BIOMAS ~ ., family = Gamma(link = log), data = entrenar)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4922 -0.7954 -0.1666 0.4182 1.5964
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.099546 9.159715 1.212 0.2325
TM1 0.437576 0.457743 0.956 0.3447
TM2 0.384137 0.444555 0.864 0.3926
TM3 0.448039 0.582720 0.769 0.4464
TM4 -0.035895 0.617363 -0.058 0.9539
TM5 0.027237 0.515759 0.053 0.9581
TM7 -0.018001 0.417409 -0.043 0.9658
BRIGHT -0.503779 0.791271 -0.637 0.5279
GREEN 0.627758 0.679168 0.924 0.3607
MOIST -0.271592 0.640674 -0.424 0.6738
NDVI -20.985287 27.223143 -0.771 0.4452
NDVIC 15.108460 13.270804 1.138 0.2615
PEND 0.001204 0.015875 0.076 0.9399
ALT -0.001040 0.001679 -0.619 0.5391
H 0.080295 0.035993 2.231 0.0312 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.8120839)
Null deviance: 58.13 on 55 degrees of freedom
Residual deviance: 36.52 on 41 degrees of freedom
AIC: 595.41
Number of Fisher Scoring iterations: 14
summary(lm5)
Call:
lm(formula = BIOMAS ~ TM3 + BRIGHT + NDVIC + H, data = entrenar)
Residuals:
Min 1Q Median 3Q Max
-99.37 -30.51 -13.02 21.00 181.98
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 105.909 147.888 0.716 0.47717
TM3 10.183 4.256 2.393 0.02045 *
BRIGHT -4.611 1.340 -3.441 0.00116 **
NDVIC 487.008 305.084 1.596 0.11660
H 3.954 1.717 2.303 0.02538 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 52.46 on 51 degrees of freedom
Multiple R-squared: 0.435, Adjusted R-squared: 0.3907
F-statistic: 9.818 on 4 and 51 DF, p-value: 5.74e-06
summary(lm6)
Call:
glm(formula = BIOMAS ~ TM3 + BRIGHT + H, family = Gamma(link = "log"),
data = entrenar)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7612 -0.7162 -0.1862 0.3547 1.6836
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.79928 1.37938 4.929 8.82e-06 ***
TM3 0.05952 0.02484 2.396 0.02021 *
BRIGHT -0.04705 0.01781 -2.641 0.01088 *
H 0.07821 0.02458 3.182 0.00247 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.6614027)
Null deviance: 58.130 on 55 degrees of freedom
Residual deviance: 39.293 on 52 degrees of freedom
AIC: 577.95
Number of Fisher Scoring iterations: 6
Plot de diagnostico
par(mfrow=c(2,2))
plot(lm1)
plot(lm2)
plot(lm3)
plot(lm4)
plot(lm5)
plot(lm6)
7.6 Validación Independientes
7.6.1 Predicción
<- predict(lm1, validar, type = 'response')
pred1 <- predict(lm2, validar, type = 'response')
pred2 <- predict(lm3, validar, type = 'response')
pred3 <- predict(lm4, validar, type = 'response')
pred4 <- predict(lm5, validar, type = 'response')
pred5 <- predict(lm6, validar, type = 'response') pred6
7.6.2 R^2
postResample(validar$BIOMAS, pred1)
RMSE Rsquared MAE
44.4093971 0.4945806 36.5128797
postResample(validar$BIOMAS, pred2)
RMSE Rsquared MAE
47.5393225 0.4190765 37.4912978
postResample(validar$BIOMAS, pred3)
RMSE Rsquared MAE
41.9033685 0.5493197 33.2810130
postResample(validar$BIOMAS, pred4)
RMSE Rsquared MAE
51.073834 0.503803 36.899949
postResample(validar$BIOMAS, pred5)
RMSE Rsquared MAE
42.2525620 0.5466302 35.0206725
postResample(validar$BIOMAS, pred6)
RMSE Rsquared MAE
44.5042861 0.5289186 32.5334253
7.6.3 Plot de valores predichos vs observados
par(mfrow=c(1,2))
plot_prediction(validar$BIOMAS, pred1, main='lm simple') # nuestra funcion
plot_prediction(validar$BIOMAS, pred2, main='GLM simple')
plot_prediction(validar$BIOMAS, pred3, main='lm todo')
plot_prediction(validar$BIOMAS, pred4, main='GLM todo')
plot_prediction(validar$BIOMAS, pred5, main='lm seleccion')
plot_prediction(validar$BIOMAS, pred6, main='GLM deleccion')
7.6.4 Residuos
<- validar$BIOMAS - pred1
res1 <- validar$BIOMAS - pred2
res2 <- validar$BIOMAS - pred3
res3 <- validar$BIOMAS - pred4
res4 <- validar$BIOMAS - pred5
res5 <- validar$BIOMAS - pred6 res6
7.6.5 Plot de residuos
Los observados vs los predichos
plot_residuos(res1, pred1) # nuestra funcion
plot_residuos(res2, pred2)
plot_residuos(res3, pred3)
plot_residuos(res4, pred4)
plot_residuos(res5, pred5)
plot_residuos(res6, pred6)
7.6.6 Histrogramas
par(mfrow=c(1,2))
hist(res1, main = 'lm simple')
hist(res2, main = 'GLM simple')
hist(res3, main = 'lm todo')
hist(res4, main = 'GLM todo')
hist(res5, main = 'lm seleccion')
hist(res6, main = 'GLM seleccion')
7.6.7 Shapiro Test
H0 -> dist. normal H1 -> dist. no normal
shapiro.test(res1)
Shapiro-Wilk normality test
data: res1
W = 0.96422, p-value = 0.3049
shapiro.test(res2)
Shapiro-Wilk normality test
data: res2
W = 0.94851, p-value = 0.102
shapiro.test(res3)
Shapiro-Wilk normality test
data: res3
W = 0.97745, p-value = 0.6747
shapiro.test(res4) # no normal
Shapiro-Wilk normality test
data: res4
W = 0.90988, p-value = 0.007368
shapiro.test(res5)
Shapiro-Wilk normality test
data: res5
W = 0.97574, p-value = 0.6182
shapiro.test(res6) # no normal
Shapiro-Wilk normality test
data: res6
W = 0.92007, p-value = 0.01432
7.6.8 Significancia de las diferencias
anova(lm1,lm2,lm3,lm4,lm5, lm6) # diferencias entre gaussianos
Analysis of Variance Table
Model 1: BIOMAS ~ H
Model 2: BIOMAS ~ H
Model 3: BIOMAS ~ TM1 + TM2 + TM3 + TM4 + TM5 + TM7 + BRIGHT + GREEN +
MOIST + NDVI + NDVIC + PEND + ALT + H
Model 4: BIOMAS ~ TM1 + TM2 + TM3 + TM4 + TM5 + TM7 + BRIGHT + GREEN +
MOIST + NDVI + NDVIC + PEND + ALT + H
Model 5: BIOMAS ~ TM3 + BRIGHT + NDVIC + H
Model 6: BIOMAS ~ TM3 + BRIGHT + H
Res.Df RSS Df Sum of Sq F Pr(>F)
1 54 173261
2 54 44 0 173217
3 41 134181 13 -134137
4 41 37 0 134144
5 51 140355 -10 -140319 4.2876 0.0004042 ***
6 52 39 -1 140316
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Sería al menos 1 es diferentes o significato, no necesariamente es el 5
AIC(lm1, lm2, lm3, lm4, lm5, lm6)
df AIC
lm1 3 615.0045
lm2 3 580.9045
lm3 16 626.6902
lm4 16 595.4066
lm5 6 609.2097
lm6 5 577.9501
Se puede estar en desacuerdo con esto, todo depende de su objetivo, el modelo 6 (lm6
) es parsimonioso, pero no el que da mejores resultados.
RSS: Df:
7.7 Mejor modelo con re-muestreo
usamos caret para definir un tipo de metodo de re-muestreo para entrenar modelos mas robustos.
savePredictions = TRUE
= nos va a guardar los daos internamente en el modelo validacion K-fold, con K=5, y 5 repeticiones aleatoreas
<- trainControl(method = "repeatedcv",
control number = 5, # K
repeats = 5,
savePredictions = TRUE)
Usar mejor modelos agregamos el parametro trControl que nos permite pasar la informacion del tipo de vadilacion cruzada a usar todos los datos, no solo los de entrenar, ya que las particiones se hacen internamente.
7.7.1 Entrenar Modelos con cv
Modelos 7
## efecto cantidad de variables en validaciones mas robustas!
<- train(BIOMAS ~.,
lm7 method = "glm",
data = data2,
family = Gamma(link = log),
trControl = control)
summary(lm7)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5663 -0.8534 -0.1446 0.3204 1.6855
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.765e+00 5.666e+00 1.370 0.174567
TM1 4.150e-01 3.397e-01 1.222 0.225644
TM2 2.603e-01 3.413e-01 0.763 0.448091
TM3 4.301e-01 4.224e-01 1.018 0.311786
TM4 -3.962e-01 4.897e-01 -0.809 0.420965
TM5 1.045e-01 3.933e-01 0.266 0.791257
TM7 9.071e-02 3.147e-01 0.288 0.773926
BRIGHT -2.014e-01 6.218e-01 -0.324 0.746916
GREEN 8.252e-01 5.001e-01 1.650 0.103039
MOIST -7.137e-02 4.807e-01 -0.148 0.882373
NDVI -1.987e+01 1.913e+01 -1.039 0.302292
NDVIC 1.530e+01 1.014e+01 1.508 0.135579
PEND -7.538e-03 1.199e-02 -0.629 0.531550
ALT -2.434e-04 1.169e-03 -0.208 0.835610
H 8.856e-02 2.510e-02 3.528 0.000714 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.7076406)
Null deviance: 95.048 on 90 degrees of freedom
Residual deviance: 55.750 on 76 degrees of freedom
AIC: 937.46
Number of Fisher Scoring iterations: 11
Modelo 6
<- train(BIOMAS ~ TM3 + BRIGHT + H, # los determinados por el train de lm6
lm8 method = "glm",
data = data2,
family = Gamma(link = log),
trControl = control)
summary(lm8)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7172 -0.8079 -0.1219 0.3163 1.8271
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.56495 1.02006 6.436 6.46e-09 ***
TM3 0.05896 0.01869 3.154 0.002210 **
BRIGHT -0.04554 0.01333 -3.417 0.000965 ***
H 0.08557 0.01904 4.495 2.14e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.6453826)
Null deviance: 95.048 on 90 degrees of freedom
Residual deviance: 61.377 on 87 degrees of freedom
AIC: 925.12
Number of Fisher Scoring iterations: 6
7.7.2 Datos de ajuste internos de cada iteración
$resample lm7
RMSE Rsquared MAE Resample
1 42.08596 0.55062712 34.34071 Fold1.Rep1
2 164.24473 0.11760861 93.04358 Fold2.Rep1
3 62.70291 0.36601711 44.95314 Fold3.Rep1
4 53.79932 0.25253642 39.66982 Fold4.Rep1
5 82.24359 0.20767591 60.84382 Fold5.Rep1
6 64.45791 0.22995763 39.02619 Fold1.Rep2
7 170.00849 0.07854019 82.37425 Fold2.Rep2
8 43.10085 0.42965684 35.63650 Fold3.Rep2
9 39.07610 0.57550108 33.40203 Fold4.Rep2
10 89.18437 0.13761200 72.19029 Fold5.Rep2
11 64.32318 0.54899375 42.86009 Fold1.Rep3
12 73.58456 0.40000220 54.21211 Fold2.Rep3
13 46.76064 0.54345393 38.59701 Fold3.Rep3
14 31.23562 0.64675677 26.76364 Fold4.Rep3
15 158.51760 0.17864936 89.69632 Fold5.Rep3
16 152.52734 0.15959130 77.79163 Fold1.Rep4
17 41.18393 0.48586434 29.45265 Fold2.Rep4
18 61.27450 0.45782888 38.05745 Fold3.Rep4
19 66.67338 0.31925216 49.10811 Fold4.Rep4
20 52.02723 0.48809468 40.18278 Fold5.Rep4
21 48.76703 0.52934030 37.69387 Fold1.Rep5
22 64.96093 0.26812830 45.72222 Fold2.Rep5
23 61.23993 0.39640010 42.70253 Fold3.Rep5
24 110.39498 0.07262132 63.72854 Fold4.Rep5
25 55.79810 0.49990714 41.85867 Fold5.Rep5
7.7.3 Visualización
Visualización de R^2
par(mfrow=c(1,2))
boxplot(lm7$resample[,2], main ='R2 todo', ylim = c(0,1))
abline(h = median(lm7$resample[,2]), lty = 2, col = "red")
boxplot(lm8$resample[,2], main ='R2 (TM3 + BRIGHT + H)', ylim = c(0,1))
abline(h = median(lm8$resample[,2]), lty = 2, col = "red")
Visualización de RMSE
par(mfrow=c(1,2))
boxplot(lm7$resample[,1], main ='RMSE todo', ylim = c(20,120))
abline(h = median(lm7$resample[,1]), lty = 2, col = "red")
boxplot(lm8$resample[,1], main ='RMSE (TM3 + BRIGHT + H)', ylim = c(20,120))
abline(h = median(lm8$resample[,1]), lty = 2, col = "red")
En este caso tenemos una sitribución de datos
7.7.4 Observados vs predichos
# lm7$pred
par(mfrow=c(1,2))
<- lm7$pred[,2]
obs7 <- lm7$pred[,1]
pred7
plot(obs7, pred7, pch = 16, col = rgb(0,0.5,0,0.3)) # red, green, blue, alpha
abline(0, 1, lty = 2, col = "red") # intercepto, pendiente
<- lm(pred7 ~ obs7 - 1)
lm abline(lm)
legend('topleft', legend=c('Linea 1:1', 'Linea ajustada'), lty = c(2,1),
col = c("red", "black"), bty = 'n')
<- lm8$pred[,2]
obs8 <- lm8$pred[,1]
pred8
plot(obs8, pred8, pch = 17, col = rgb(0,0,0.5,0.3))
abline(0, 1, lty = 2, col = "red")
<- lm(pred8 ~ obs8 - 1)
lm abline(lm)
legend('topleft', legend=c('Linea 1:1', 'Linea ajustada'), lty = c(2,1),
col = c("red", "black"), bty = 'n')
EN el Eje x son fijas las predicciones, mientras que en las prediccionesn pueden variar.