Carregamento de pacotes:
Neste exemplo são utilizados os seguintes pacotes: readr, dplyr, magrittr, tibble, broom, reshape2, ggplot2, plotly, lmtest e pROC. Para a confecção do documento, é necessário o pacote rmarkdown.
# readr -- para leitura de dados
if (!require('readr')) install.packages('readr'); library('readr')
# dplyr -- para facilitar manipulacao de dados
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
# magrittr -- para facilitar manipulacao de dados
if (!require('magrittr')) install.packages('magrittr'); library('magrittr')
# tibble -- para facilitar manipulacao de dados
if (!require('tibble')) install.packages('tibble'); library('tibble')
# reshape2 -- para facilitar manipulacao de dados
if (!require('reshape2')) install.packages('reshape2'); library('reshape2')
# broom -- para facilitar manipulacao de dados
if (!require('broom')) install.packages('broom'); library('broom')
# ggplot2 -- para os plots
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
# plotly -- para os plots
if (!require('plotly')) install.packages('plotly'); library('plotly')
# # gridExtra -- para os plots
# if (!require('gridExtra')) install.packages('gridExtra'); library('gridExtra')
# lmtest -- para os testes de razao de verossimilhancas e para o teste de wald
if (!require('lmtest')) install.packages('lmtest'); library('lmtest')
# pROC -- para a curva ROC
if (!require('pROC')) install.packages('pROC'); library('pROC')
Carregamento dos dados:
Os dados utilizados neste exemplo são importados diretamente do repositório no GitHub e podem ser acessados no link abaixo:
#options(scipen=999)
crabs <- readr::read_table2("https://raw.githubusercontent.com/allanvc/categorical_data_analysis/master/exemplos/crabs/crabs.txt", col_names = FALSE)
crabs <- crabs %>%
magrittr::set_colnames(c("color", "spine", "width", "satell", "weight")) %>%
na.omit() %>%
dplyr::mutate(y = dplyr::if_else(satell > 0, 1, 0), # resposta
weight = weight/1000, # transformando peso em kg
color = color - 1) # para escala de color de 1 a 4
crabs
Frequência para as variáveis categóricas:
tab_freq <- sapply(crabs[, c("color", "spine", "satell", "y")], table)
tab_freq
## $color
##
## 1 2 3 4
## 12 95 44 22
##
## $spine
##
## 1 2 3
## 37 15 121
##
## $satell
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15
## 62 16 9 19 19 15 13 4 6 3 3 1 1 1 1
##
## $y
##
## 0 1
## 62 111
Ajuste do modelo:
M0 <- glm(data=crabs, y ~ 1, family=binomial(link="logit"))
summary(M0)
##
## Call:
## glm(formula = y ~ 1, family = binomial(link = "logit"), data = crabs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4326 -1.4326 0.9421 0.9421 0.9421
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5824 0.1585 3.673 0.000239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 225.76 on 172 degrees of freedom
## AIC: 227.76
##
## Number of Fisher Scoring iterations: 4
Outras Estatísticas:
# -2 Log L
-2*logLik(M0)
## 'log Lik.' 225.7585 (df=1)
Ajuste do modelo:
M1 <- glm(data=crabs, y ~ width, family=binomial(link="logit"))
summary(M1)
##
## Call:
## glm(formula = y ~ width, family = binomial(link = "logit"), data = crabs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0281 -1.0458 0.5480 0.9066 1.6942
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.3508 2.6287 -4.698 2.62e-06 ***
## width 0.4972 0.1017 4.887 1.02e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 194.45 on 171 degrees of freedom
## AIC: 198.45
##
## Number of Fisher Scoring iterations: 4
-2logLik:
# -2logLik
-2*logLik(M1)
## 'log Lik.' 194.4527 (df=2)
Int. Conf. OR:
# OR
exp(confint.default(M1)) # o metodo *.default* assume normalidade assintotica, por isso eh utilizado.
## 2.5 % 97.5 %
## (Intercept) 2.503452e-08 0.0007476128
## width 1.346936e+00 2.0069749360
Teste de Razão de Verossimilhanças:
# 1) razao de verossimilhancas
#library(lmtest)
lmtest::lrtest(M1)
Teste de Wald:
# 2) teste de Wald
lmtest::waldtest(M1)
Teste Score:
# 3) Score
anova(M1,test="Rao")
AGRESTI - INTRODUCTION TO CATEGORIACAL DATATA ANALYSIS 2ND - PG.118
Ajuste do modelo:
M2.1 <- glm(data=crabs, y ~ width + color, family=binomial(link="logit"))
summary(M2.1)
##
## Call:
## glm(formula = y ~ width + color, family = binomial(link = "logit"),
## data = crabs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1692 -0.9889 0.5429 0.8700 1.9742
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.0708 2.8068 -3.588 0.000333 ***
## width 0.4583 0.1040 4.406 1.05e-05 ***
## color -0.5090 0.2237 -2.276 0.022860 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 189.12 on 170 degrees of freedom
## AIC: 195.12
##
## Number of Fisher Scoring iterations: 4
-2logLik:
# -2logLik
-2*logLik(M2.1)
## 'log Lik.' 189.1212 (df=3)
Int. Conf. OR:
# OR
exp(confint.default(M2.1)) # o metodo *.default* assume normalidade assintotica, por isso eh utilizado.
## 2.5 % 97.5 %
## (Intercept) 1.726271e-07 0.01036267
## width 1.289737e+00 1.93901774
## color 3.877266e-01 0.93179864
Teste de Razão de Verossimilhanças:
# 1) razao de verossimilhancas
#library(lmtest)
lmtest::lrtest(M2.1)
Teste de Wald:
# 2) teste de Wald
lmtest::waldtest(M2.1)
Teste Score:
# 3) Score
anova(M2.1,test="Rao")
Curvas Logísticas (segundo COLOR)
Apenas para os valores existentes de width:
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
p1 <- crabs %>%
dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4"))) %>%
dplyr::mutate(fitted.values = M2.1$fitted.values) %>%
ggplot()+
geom_point(aes(x=width, y=fitted.values, colour = factor(color)))+
labs(x="WIDTH", y="Probabilities")+
ggtitle("Valores Preditos - COLOR(ORDINAL)")+
theme_bw()+
theme(panel.border = element_blank())+ # para ficar igual o plotly
guides(color=guide_legend(title=NULL))+
scale_color_manual(values=plotly_pallette)+
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p1)
Utilizando vários valores de width:
Ajustes pré-plotagem:
# Create a temporary data frame of hypothetical values
temp.data <- data.frame(width = seq(20, 34, 0.11),
color = c(rep(1, 128),
rep(2, 128),
rep(3, 128),
rep(4, 128)) ) # o R vai reciclar o tamanho da coluna width para 4x 128
# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(predict(M2.1, newdata = temp.data,
type="link", se=TRUE))
# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)
# Calculate confidence intervals
std <- qnorm(0.95 / 2 + 0.5)
new.data$ymin <- M2.1$family$linkinv(new.data$fit - std * new.data$se)
new.data$ymax <- M2.1$family$linkinv(new.data$fit + std * new.data$se)
new.data$fit <- M2.1$family$linkinv(new.data$fit) # Rescale to 0-1
Plot Completo:
new.data <- new.data %>%
dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4")))
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
plotly_pallette2 <- c('#FF7F0E', '#1F77B4','#2CA02C', '#D62728') # invertendo cor1 e cor2 para a legenda funcionar
# Plot everything
p2 <- crabs %>%
dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4"))) %>%
ggplot(aes(x=width, y=y))+
geom_line(data=new.data, aes(y=fit, colour=factor(color))) +
geom_point(data=crabs, aes(y=y, x=width, colour=factor(color)))+
labs(x="WIDTH", y="Probability")+
ggtitle("Curvas Logísticas Ajustadas - COLOR(ORDINAL)")+
guides(color=guide_legend(title=NULL))+
theme_bw()+
theme(panel.border = element_blank())+ # para ficar igual o plotly
scale_color_manual(values=c(plotly_pallette, rev(plotly_pallette2)))+ #truque para as legendas funcionarem no ggplotly
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p2)
# p2 <- crabs %>%
# dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4"))) %>%
# ggplot(aes(x=width, y=y))+
# geom_line(data=new.data, aes(y=fit, colour=factor(color))) +
# geom_point(data=crabs, aes(y=y, x=width, colour=factor(color)))+
# labs(x="WIDTH", y="Probability")+
# ggtitle("Curvas Logísticas Ajustadas - COLOR(ORDINAL)")+
# guides(color=guide_legend(title=NULL))+
# theme_bw()+
# theme(panel.border = element_blank())+ # para ficar igual o plotly
# # scale_color_manual(values=rep(c(plotly_pallette),2))+ #truque para as legendas funcionarem no ggplotly
# theme(plot.title = element_text(hjust=0.5))
#
#
# plotly::ggplotly(p2)
AGRESTI - INTRODUCTION TO CDA 2ND - PG.116
Preparação dos dados:
# soh precisamos tratar color como fator -- para a termos como variavel nominal:
crabs2 <- crabs %>%
dplyr::mutate(color = factor(color, labels = c("Lt Med", "Medium", "Dk Med", "Dark")))
crabs2
Ajuste do modelo:
# quando criamos um fator, o R usa como referencia o level de numero mais baixoquando vai criar dummies a
#... partir desse fator
# por isso eh necessario redefinir o level base com relevel()
# M2.2 <- glm(data=crabs2, y ~ width + relevel(factor(color), ref=4), family=binomial(link="logit"))
#ou:
M2.2 <- glm(data=crabs2, y ~ width + relevel(factor(color), ref='Dark'), family=binomial(link="logit"))
summary(M2.2)
##
## Call:
## glm(formula = y ~ width + relevel(factor(color), ref = "Dark"),
## family = binomial(link = "logit"), data = crabs2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1124 -0.9848 0.5243 0.8513 2.1413
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -12.7151 2.7617 -4.604
## width 0.4680 0.1055 4.434
## relevel(factor(color), ref = "Dark")Lt Med 1.3299 0.8525 1.560
## relevel(factor(color), ref = "Dark")Medium 1.4023 0.5484 2.557
## relevel(factor(color), ref = "Dark")Dk Med 1.1061 0.5921 1.868
## Pr(>|z|)
## (Intercept) 4.14e-06 ***
## width 9.26e-06 ***
## relevel(factor(color), ref = "Dark")Lt Med 0.1188
## relevel(factor(color), ref = "Dark")Medium 0.0106 *
## relevel(factor(color), ref = "Dark")Dk Med 0.0617 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 187.46 on 168 degrees of freedom
## AIC: 197.46
##
## Number of Fisher Scoring iterations: 4
-2logLik:
# -2logLik
-2*logLik(M2.2)
## 'log Lik.' 187.457 (df=5)
Int. Conf. OR:
# OR
exp(confint.default(M2.2)) # o metodo *.default* assume normalidade assintotica, por isso eh utilizado.
## 2.5 % 97.5 %
## (Intercept) 1.340018e-08 6.740367e-04
## width 1.298348e+00 1.963679e+00
## relevel(factor(color), ref = "Dark")Lt Med 7.110636e-01 2.010225e+01
## relevel(factor(color), ref = "Dark")Medium 1.387381e+00 1.190852e+01
## relevel(factor(color), ref = "Dark")Dk Med 9.471157e-01 9.646324e+00
Teste de Razão de Verossimilhanças:
# 1) razao de verossimilhancas
#library(lmtest)
lmtest::lrtest(M2.2)
Teste de Wald:
# 2) teste de Wald
lmtest::waldtest(M2.2)
Teste Score:
# 3) Score
anova(M2.2,test="Rao")
Curvas Logísticas com Intervalos de Confiança (segundo COLOR)
Ajustes pré-plotagem:
# Create a temporary data frame of hypothetical values
temp.data <- data.frame(width = seq(20, 34, 0.11),
color = c(rep("Lt Med", 128),
rep("Medium", 128),
rep("Dk Med", 128),
rep("Dark", 128)) ) # o R vai reciclar o tamanho da coluna width para 4x 128
# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(predict(M2.2, newdata = temp.data,
type="link", se=TRUE))
# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)
# Calculate confidence intervals
std <- qnorm(0.95 / 2 + 0.5)
new.data$ymin <- M2.2$family$linkinv(new.data$fit - std * new.data$se)
new.data$ymax <- M2.2$family$linkinv(new.data$fit + std * new.data$se)
new.data$fit <- M2.2$family$linkinv(new.data$fit) # Rescale to 0-1
Plot:
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
# Plot everything
p3 <- ggplot(crabs2, aes(x=width, y=y))+
geom_point() +
geom_ribbon(data=new.data,
aes(y=fit,
ymin=ymin, ymax=ymax,
fill=factor(color,
levels= c("Lt Med", "Medium", "Dk Med", "Dark"))), alpha=0.3) + # color jah estava como factor
geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))) +
facet_wrap(~factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))+
labs(x="WIDTH", y="Probability")+
ggtitle("Predicted Probabilities for Y=1 With 95% Confidence Limits")+
# theme_bw()+
# labs(color='color', fill='color') + # especificar os dois se quiser alterar
guides(fill=guide_legend(title=NULL), color=guide_legend(title=NULL))+ # hah duas legendas sobrepostas -- uma que vem de fill=... e outra de color=... --- por isso temos que especificar ambos os items para eliminar o titulo das duas
scale_fill_manual(values=plotly_pallette)+
scale_color_manual(values=plotly_pallette)+
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p3)
Todas as Curvas no mesmo gráfico:
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
# Plot everything
p4 <- ggplot(crabs2, aes(x=width, y=y))+
geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))) +
geom_point(data=crabs2, aes(y=y, x=width, color=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark"))))+
labs(x="WIDTH", y="Probability")+
ggtitle("Predicted Probabilities for Y=1")+
theme_bw()+
theme(panel.border = element_blank())+ # para ficar igual o plotly
guides(color=guide_legend(title=NULL))+
scale_color_manual(values=plotly_pallette)+
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p4)
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
# Plot everything
p5 <- ggplot(crabs2, aes(x=width, y=y))+
geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark"))))+
labs(x="WIDTH", y="Probability")+
ggtitle("COLOR(NOMINAL)")+
theme_bw()+
theme(panel.border = element_blank())+ # para ficar igual o plotly
guides(color=guide_legend(title=NULL))+
scale_color_manual(values=plotly_pallette)+
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p5)
Resíduos vs índices
model.data <- broom::augment(M2.2)
plotly_pallette3 <- c('#1F77B4', '#D62728')
p6 <- model.data %>%
ggplot() +
geom_point(aes(x = 1:length(.resid), y = .resid, colour = factor(y)), size=0.9, alpha=0.7)+
geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
scale_color_manual(values=plotly_pallette3)+
guides(color=guide_legend(title=NULL))+ # para tirar o titulo da legenda
theme(plot.title = element_text(hjust=0.5))+
labs(title="Pearson Residuals", x="index", y="Pearson Residuals")
p7 <- model.data %>%
ggplot() +
geom_point(aes(x = 1:length(.resid), y = M2.2$residuals, colour = factor(y)), size=0.9, alpha=0.7)+
geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
guides(color=guide_legend(title=NULL))+ # para tirar o titulo da legenda
scale_color_manual(values=plotly_pallette3)+
theme(plot.title = element_text(hjust=0.5))+
labs(title="Deviance Residuals", x="index", y="Pearson Residuals")
# library(gridExtra)
# pp <- grid.arrange(p4, p5, ncol=2)
# plotly::ggplotly(plotly::subplot(p4, p5))
# os subplots do plotly nao conseguem carregar os titulos separadamente
plotly::ggplotly(p6)
plotly::ggplotly(p7)
Preparação para as demais medidas de influência:
medinflu <-influence.measures(M2.2)
inf_measures <- tibble::as.tibble(medinflu$infmat) %>%
magrittr::set_colnames(c("dfb.int", "dfb.width", "dfb.LtMed", "dfb.Medium", "dfb.DkMd", "dffit", "cov.r", "cook.d", "hat")) %>%
dplyr::select(cook.d, hat, dfb.int, dfb.width, dfb.LtMed, dfb.Medium, dfb.DkMd)
inf_measures
Demais gráficos:
inf_measures2 <- inf_measures %>%
reshape2::melt() #%>% # passando para o formato long para fazermos um facet_wrap
## No id variables; using all as measure variables
inf_measures2 <- cbind(inf_measures2, model.data[,"y", drop=FALSE], x=1:nrow(model.data))
# primeiro cook.d e leverage
p8 <- inf_measures2 %>%
dplyr::filter(stringr::str_detect(variable, "cook.d|hat")) %>%
ggplot() +
geom_point(aes(x = x, y = value, colour = factor(y)), size=0.9, alpha=0.7)+
geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
guides(color=guide_legend(title=NULL))+ # para tirar o titulo da legenda
facet_wrap(~variable)+
scale_color_manual(values=plotly_pallette3)+
theme(plot.title = element_text(hjust=0.5))+
labs(title="Influence Diagnostics", x="index", y="value")
plotly::ggplotly(p8)
# depois dfbetas sem intercepto
p9 <- inf_measures2 %>%
dplyr::filter(!stringr::str_detect(variable, "cook.d|hat|dfb.int")) %>%
ggplot() +
geom_point(aes(x = x, y = value, colour = factor(y)), size=0.9, alpha=0.7)+
geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
guides(color=guide_legend(title=NULL))+ # para tirar o titulo da legenda
facet_wrap(~variable)+
scale_color_manual(values=plotly_pallette3)+
theme(plot.title = element_text(hjust=0.5))+
labs(title="Influence Diagnostics - DfBetas", x="index", y="value")
plotly::ggplotly(p9)
Preparação dos dados:
# soh precisamos tratar color como fator:
# ficando apenas com as variaveis de interesse y, width, color
crabs3 <- crabs2 %>%
dplyr::select(y, width, color)
head(crabs3)
Seleção Automática de Modelos:
Backward a partir do modelo completo y \(\sim\) width + Lt Med + Medium + Dk Med:
# ajustando os dados
# precisamos fazer isso, para que a funcao step reconheça as variaveis dummy
# step(M2.2) # embora glm reconheça as dummy, a funcao step trata como uma variavel color soh com 3 g.l.
df <- as.data.frame(crabs3)
str(df)
## 'data.frame': 173 obs. of 3 variables:
## $ y : num 1 0 0 0 1 1 0 1 1 1 ...
## $ width: num 28.3 24.8 26.5 25.6 28.2 27.1 24.7 25 25.8 26.8 ...
## $ color: Factor w/ 4 levels "Lt Med","Medium",..: 2 3 1 3 2 1 4 1 4 2 ...
m <- model.matrix(~color-1, data=crabs3)
df <- cbind(crabs3[,-3], m[,-4]) # tirando Dark
colnames(df) <- c("y", "width", "Lt_Med", "Medium", "Dk_Med")
tibble::as.tibble(df)
# modelo completo:
fullmod <- glm(data=df, y ~ ., family=binomial(link="logit"))
# olhar para os AIC's mais baixos
step(fullmod) # esse jah eh o melhor no backward
## Start: AIC=197.46
## y ~ width + Lt_Med + Medium + Dk_Med
##
## Df Deviance AIC
## <none> 187.46 197.46
## - Lt_Med 1 190.07 198.07
## - Dk_Med 1 191.11 199.11
## - Medium 1 194.37 202.37
## - width 1 212.06 220.06
##
## Call: glm(formula = y ~ width + Lt_Med + Medium + Dk_Med, family = binomial(link = "logit"),
## data = df)
##
## Coefficients:
## (Intercept) width Lt_Med Medium Dk_Med
## -12.715 0.468 1.330 1.402 1.106
##
## Degrees of Freedom: 172 Total (i.e. Null); 168 Residual
## Null Deviance: 225.8
## Residual Deviance: 187.5 AIC: 197.5
Forward a partir do modelo simples:
modmin <- glm(data=df, y~1, family=binomial(link="logit"))
step(modmin, direction = "forward", scope = ~ width + Lt_Med + Medium + Dk_Med) #ok!
## Start: AIC=227.76
## y ~ 1
##
## Df Deviance AIC
## + width 1 194.45 198.45
## + Medium 1 219.18 223.18
## <none> 225.76 227.76
## + Lt_Med 1 225.06 229.06
## + Dk_Med 1 225.11 229.11
##
## Step: AIC=198.45
## y ~ width
##
## Df Deviance AIC
## + Medium 1 191.76 197.76
## <none> 194.45 198.45
## + Lt_Med 1 194.37 200.37
## + Dk_Med 1 194.45 200.45
##
## Step: AIC=197.76
## y ~ width + Medium
##
## Df Deviance AIC
## <none> 191.76 197.76
## + Dk_Med 1 190.07 198.07
## + Lt_Med 1 191.11 199.11
##
## Call: glm(formula = y ~ width + Medium, family = binomial(link = "logit"),
## data = df)
##
## Coefficients:
## (Intercept) width Medium
## -12.1827 0.4793 0.5763
##
## Degrees of Freedom: 172 Total (i.e. Null); 170 Residual
## Null Deviance: 225.8
## Residual Deviance: 191.8 AIC: 197.8
Curva ROC para os modelo selecionados:
library(pROC)
#https://stackoverflow.com/questions/27584099/plot-multiple-roc-curves-for-logistic-regression-model-in-r
y <- crabs2[["y"]]
# modelo 2.2 (resultante do backward)
preds1 <- predict(M2.2)
roc1=roc(y ~ preds1)
# modelo (resultante do forward)
mod_fwd <- glm(data=df, y ~ width + Medium, family=binomial(link="logit"))
preds2 <- predict(mod_fwd)
roc2=roc(y ~ preds2)
p10 <- ggplot()+
geom_point(aes(x=(1-roc1$specificities), y=roc1$sensitivities), colour = plotly_pallette[1])+
geom_point(aes(x=(1-roc2$specificities), y=roc2$sensitivities), colour = plotly_pallette[3])+
scale_x_continuous(limits = c(0,1))+
# geom_abline(intercept = 0, slope = 1, colour = plotly_pallette[2])+
geom_line(aes(x=c(0,1), y=c(0,1)), colour = plotly_pallette[2])+
labs(x="1-especificidade", y="sensibilidade")+
ggtitle("Curvas ROC")+
theme_bw()+
theme(panel.border = element_blank())+ # para ficar igual o plotly
theme(plot.title = element_text(hjust=0.5))
plotly::ggplotly(p10)
Ajuste do modelo:
# utilizando como base a cor 'Dark' novamente
M3 <- glm(data=crabs2, y ~ width*relevel(factor(color), ref='Dark'), family=binomial(link="logit"))
summary(M3)
##
## Call:
## glm(formula = y ~ width * relevel(factor(color), ref = "Dark"),
## family = binomial(link = "logit"), data = crabs2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0546 -0.9130 0.5285 0.8140 1.9657
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -5.85383 6.69393
## width 0.20043 0.26166
## relevel(factor(color), ref = "Dark")Lt Med 4.10122 13.27532
## relevel(factor(color), ref = "Dark")Medium -4.18613 7.58093
## relevel(factor(color), ref = "Dark")Dk Med -15.66423 9.56065
## width:relevel(factor(color), ref = "Dark")Lt Med -0.09443 0.50042
## width:relevel(factor(color), ref = "Dark")Medium 0.21844 0.29524
## width:relevel(factor(color), ref = "Dark")Dk Med 0.65794 0.37533
## z value Pr(>|z|)
## (Intercept) -0.874 0.3818
## width 0.766 0.4437
## relevel(factor(color), ref = "Dark")Lt Med 0.309 0.7574
## relevel(factor(color), ref = "Dark")Medium -0.552 0.5808
## relevel(factor(color), ref = "Dark")Dk Med -1.638 0.1013
## width:relevel(factor(color), ref = "Dark")Lt Med -0.189 0.8503
## width:relevel(factor(color), ref = "Dark")Medium 0.740 0.4594
## width:relevel(factor(color), ref = "Dark")Dk Med 1.753 0.0796 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 183.08 on 165 degrees of freedom
## AIC: 199.08
##
## Number of Fisher Scoring iterations: 5
-2logLik:
# -2logLik
-2*logLik(M3)
## 'log Lik.' 183.0806 (df=8)
Int. Conf. OR:
# OR
exp(confint.default(M3)) # o metodo *.default* assume normalidade assintotica, por isso eh utilizado.
## 2.5 % 97.5 %
## (Intercept) 5.752129e-09 1.430867e+03
## width 7.316783e-01 2.040661e+00
## relevel(factor(color), ref = "Dark")Lt Med 3.028073e-10 1.205339e+13
## relevel(factor(color), ref = "Dark")Medium 5.359023e-09 4.314052e+04
## relevel(factor(color), ref = "Dark")Dk Med 1.145703e-15 2.163458e+01
## width:relevel(factor(color), ref = "Dark")Lt Med 3.412170e-01 2.426326e+00
## width:relevel(factor(color), ref = "Dark")Medium 6.975291e-01 2.219082e+00
## width:relevel(factor(color), ref = "Dark")Dk Med 9.252405e-01 4.029248e+00
Teste de Razão de Verossimilhanças:
# 1) razao de verossimilhancas
#library(lmtest)
lmtest::lrtest(M3)
Teste de Wald:
# 2) teste de Wald
lmtest::waldtest(M3)
Teste Score:
# 3) Score
anova(M3,test="Rao")
AGRESTI, ALAN. An Introduction to Categorical Data Analysis. John Wiley & Sons, second edition, 2007.
NOTAS DE AULA. Análise de Dados Categorizados. Curso de Graduação em Estatística, UnB, 2018.
R CORE TEAM. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.