Loading packages:

This example uses the following packages: readr, dplyr, magrittr, tibble, broom, reshape2, ggplot2, plotly, lmtest, and pROC. The rmarkdown package is also required to build the document itself.

# readr -- for reading the data
if (!require('readr')) install.packages('readr'); library('readr')

# dplyr -- for easier data manipulation
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')

# magrittr -- for easier data manipulation
if (!require('magrittr')) install.packages('magrittr'); library('magrittr')

# tibble -- for easier data manipulation
if (!require('tibble')) install.packages('tibble'); library('tibble')

# reshape2 -- for easier data manipulation
if (!require('reshape2')) install.packages('reshape2'); library('reshape2')

# broom -- for easier manipulation of model output
if (!require('broom')) install.packages('broom'); library('broom')

# ggplot2 -- for the plots
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')

# plotly -- for the plots
if (!require('plotly')) install.packages('plotly'); library('plotly')

# # gridExtra -- for the plots
# if (!require('gridExtra')) install.packages('gridExtra'); library('gridExtra')

# lmtest -- for the likelihood ratio and Wald tests
if (!require('lmtest')) install.packages('lmtest'); library('lmtest')

# pROC -- for the ROC curve
if (!require('pROC')) install.packages('pROC'); library('pROC')

Loading the data:

The data used in this example are imported directly from the GitHub repository and can be accessed at the URL used in the code below:

#options(scipen=999)
crabs <- readr::read_table2("https://raw.githubusercontent.com/allanvc/categorical_data_analysis/master/exemplos/crabs/crabs.txt", col_names = FALSE)

crabs <- crabs %>%
  magrittr::set_colnames(c("color", "spine", "width", "satell", "weight")) %>%
  na.omit() %>%
  dplyr::mutate(y = dplyr::if_else(satell > 0, 1, 0), # response variable
                weight = weight/1000, # converting weight to kg
                color = color - 1) # shifting color to a 1-to-4 scale


crabs

Frequencies for the categorical variables:

tab_freq <- sapply(crabs[, c("color", "spine", "satell", "y")], table)

tab_freq
## $color
## 
##  1  2  3  4 
## 12 95 44 22 
## 
## $spine
## 
##   1   2   3 
##  37  15 121 
## 
## $satell
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 12 14 15 
## 62 16  9 19 19 15 13  4  6  3  3  1  1  1  1 
## 
## $y
## 
##   0   1 
##  62 111
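
As a quick complement to the counts, the relative frequency of the binary response can be inspected directly; a minimal sketch using base R (only the crabs data frame from above is assumed):

# proportion of crabs with at least one satellite (y = 1)
prop.table(table(crabs$y))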


M0: LOGISTIC MODEL WITH NO EFFECTS (INTERCEPT ONLY)


Model fit:

M0 <- glm(data=crabs, y ~ 1, family=binomial(link="logit"))

summary(M0)
## 
## Call:
## glm(formula = y ~ 1, family = binomial(link = "logit"), data = crabs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4326  -1.4326   0.9421   0.9421   0.9421  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5824     0.1585   3.673 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 225.76  on 172  degrees of freedom
## AIC: 227.76
## 
## Number of Fisher Scoring iterations: 4

Other statistics:

# -2 Log L
-2*logLik(M0)
## 'log Lik.' 225.7585 (df=1)
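
For the intercept-only model, the fitted probability of y = 1 is simply the sample proportion of crabs with satellites; a minimal sanity check (assuming only M0 and crabs from above):

# the inverse logit of the intercept equals the observed proportion of y = 1
plogis(coef(M0))  # exp(0.5824)/(1 + exp(0.5824)), about 0.642
mean(crabs$y)     # 111/173, about 0.642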

M1: LOGISTIC MODEL WITH WIDTH ONLY


Model fit:

M1 <- glm(data=crabs, y ~ width, family=binomial(link="logit"))

summary(M1)
## 
## Call:
## glm(formula = y ~ width, family = binomial(link = "logit"), data = crabs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0281  -1.0458   0.5480   0.9066   1.6942  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.3508     2.6287  -4.698 2.62e-06 ***
## width         0.4972     0.1017   4.887 1.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 194.45  on 171  degrees of freedom
## AIC: 198.45
## 
## Number of Fisher Scoring iterations: 4

-2logLik:

# -2logLik
-2*logLik(M1)
## 'log Lik.' 194.4527 (df=2)

Confidence intervals for the ORs:

# OR
exp(confint.default(M1)) # the *.default* method is used because it assumes asymptotic normality (Wald-type intervals).
##                    2.5 %       97.5 %
## (Intercept) 2.503452e-08 0.0007476128
## width       1.346936e+00 2.0069749360
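
These are Wald-type intervals: the endpoints of estimate +/- z * SE on the log-odds scale, exponentiated. A minimal sketch reproducing the interval for width by hand (assuming only M1 from above):

# manual Wald 95% CI for the odds ratio of width
beta <- coef(M1)["width"]
se   <- sqrt(diag(vcov(M1)))["width"]
exp(beta + c(-1, 1) * qnorm(0.975) * se)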

Likelihood Ratio Test:

# 1) likelihood ratio test
#library(lmtest)
lmtest::lrtest(M1)

Wald Test:

# 2) Wald test
lmtest::waldtest(M1)

Score Test:

# 3) Score
anova(M1,test="Rao")
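
lrtest(M1) compares M1 against the intercept-only model, so the same statistic can be recovered from the deviances reported above; a minimal sketch (assuming M0 and M1 are already fitted):

# LR statistic = difference in deviances, chi-squared with 1 d.f.
LR <- M0$deviance - M1$deviance  # 225.76 - 194.45 = 31.31
pchisq(LR, df = 1, lower.tail = FALSE)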


M2.1: LOGISTIC MODEL WITH WIDTH AND COLOR (ORDINAL)

AGRESTI - AN INTRODUCTION TO CATEGORICAL DATA ANALYSIS, 2ND ED. - P. 118


Model fit:

M2.1 <- glm(data=crabs, y ~ width + color, family=binomial(link="logit"))

summary(M2.1)
## 
## Call:
## glm(formula = y ~ width + color, family = binomial(link = "logit"), 
##     data = crabs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1692  -0.9889   0.5429   0.8700   1.9742  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.0708     2.8068  -3.588 0.000333 ***
## width         0.4583     0.1040   4.406 1.05e-05 ***
## color        -0.5090     0.2237  -2.276 0.022860 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 189.12  on 170  degrees of freedom
## AIC: 195.12
## 
## Number of Fisher Scoring iterations: 4

-2logLik:

# -2logLik
-2*logLik(M2.1)
## 'log Lik.' 189.1212 (df=3)

Confidence intervals for the ORs:

# OR
exp(confint.default(M2.1)) # the *.default* method is used because it assumes asymptotic normality (Wald-type intervals).
##                    2.5 %     97.5 %
## (Intercept) 1.726271e-07 0.01036267
## width       1.289737e+00 1.93901774
## color       3.877266e-01 0.93179864

Likelihood Ratio Test:

# 1) likelihood ratio test
#library(lmtest)
lmtest::lrtest(M2.1)

Wald Test:

# 2) Wald test
lmtest::waldtest(M2.1)

Score Test:

# 3) Score
anova(M2.1,test="Rao")
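
Since M1 is nested in M2.1, the contribution of the ordinal color term can also be tested by comparing the two fitted models directly; a minimal sketch (assuming M1 and M2.1 from above):

# LR test for adding color (ordinal score) to the width-only model
lmtest::lrtest(M1, M2.1)  # equivalently: anova(M1, M2.1, test = "LRT")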

Logistic Curves (by COLOR)

Using only the observed values of width:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')

p1 <- crabs %>%
  dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4"))) %>%
  dplyr::mutate(fitted.values = M2.1$fitted.values) %>%
  ggplot()+
  geom_point(aes(x=width, y=fitted.values, colour = factor(color)))+
  labs(x="WIDTH", y="Probabilities")+
  ggtitle("Valores Preditos - COLOR(ORDINAL)")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p1)

Using a grid of hypothetical width values:

Pre-plotting setup:

# Create a temporary data frame of hypothetical values
temp.data <- data.frame(width = seq(20, 34, 0.11),
                        color = c(rep(1, 128),
                                     rep(2, 128), 
                                     rep(3, 128),
                                     rep(4, 128)) ) # R recycles the width column to length 4 x 128


# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(predict(M2.1, newdata = temp.data, 
                                        type="link", se=TRUE))

# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)

# Calculate 95% confidence limits on the link scale
std <- qnorm(0.975)  # equivalent to qnorm(0.95 / 2 + 0.5), i.e. about 1.96
new.data$ymin <- M2.1$family$linkinv(new.data$fit - std * new.data$se)
new.data$ymax <- M2.1$family$linkinv(new.data$fit + std * new.data$se)
new.data$fit <- M2.1$family$linkinv(new.data$fit)  # Rescale to 0-1
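
The predictions are made on the link (log-odds) scale and only then mapped back through the inverse link, so that the confidence limits stay inside (0, 1). A minimal check that the back-transformed fit matches a direct response-scale prediction (assuming M2.1, temp.data and new.data from above):

# back-transformed link-scale fit equals predict(..., type = "response")
all.equal(unname(new.data$fit),
          unname(predict(M2.1, newdata = temp.data, type = "response")))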

Full plot:

new.data <- new.data %>%
  dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4")))


plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
plotly_pallette2 <- c('#FF7F0E', '#1F77B4','#2CA02C', '#D62728') # swapping colors 1 and 2 so the legend works

# Plot everything
p2 <- crabs %>%
  dplyr::mutate(color = factor(color, labels = c("Lt Med - 1", "Medium - 2", "Dk Med - 3", "Dark - 4"))) %>%
  ggplot(aes(x=width, y=y))+
  geom_line(data=new.data, aes(y=fit, colour=factor(color))) + 
  geom_point(data=crabs, aes(y=y, x=width, colour=factor(color)))+
  labs(x="WIDTH", y="Probability")+
  ggtitle("Curvas Logísticas Ajustadas - COLOR(ORDINAL)")+
  guides(color=guide_legend(title=NULL))+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  scale_color_manual(values=c(plotly_pallette, rev(plotly_pallette2)))+ #truque para as legendas funcionarem no ggplotly
  theme(plot.title = element_text(hjust=0.5))
  

plotly::ggplotly(p2)


M2.2: LOGISTIC MODEL WITH WIDTH AND COLOR (NOMINAL)

AGRESTI - AN INTRODUCTION TO CATEGORICAL DATA ANALYSIS, 2ND ED. - P. 116


Data preparation:

# we only need to treat color as a factor -- so that it is handled as a nominal variable:
crabs2 <- crabs %>%
  dplyr::mutate(color = factor(color, labels = c("Lt Med", "Medium", "Dk Med", "Dark")))

crabs2

Model fit:

# when we create a factor, R uses the lowest-numbered level as the reference when it
#... builds dummy variables from that factor
# that is why the baseline level has to be redefined with relevel()

# M2.2 <- glm(data=crabs2, y ~ width + relevel(factor(color), ref=4), family=binomial(link="logit"))
# or:
M2.2 <- glm(data=crabs2, y ~ width + relevel(factor(color), ref='Dark'), family=binomial(link="logit"))

summary(M2.2)
## 
## Call:
## glm(formula = y ~ width + relevel(factor(color), ref = "Dark"), 
##     family = binomial(link = "logit"), data = crabs2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1124  -0.9848   0.5243   0.8513   2.1413  
## 
## Coefficients:
##                                            Estimate Std. Error z value
## (Intercept)                                -12.7151     2.7617  -4.604
## width                                        0.4680     0.1055   4.434
## relevel(factor(color), ref = "Dark")Lt Med   1.3299     0.8525   1.560
## relevel(factor(color), ref = "Dark")Medium   1.4023     0.5484   2.557
## relevel(factor(color), ref = "Dark")Dk Med   1.1061     0.5921   1.868
##                                            Pr(>|z|)    
## (Intercept)                                4.14e-06 ***
## width                                      9.26e-06 ***
## relevel(factor(color), ref = "Dark")Lt Med   0.1188    
## relevel(factor(color), ref = "Dark")Medium   0.0106 *  
## relevel(factor(color), ref = "Dark")Dk Med   0.0617 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 187.46  on 168  degrees of freedom
## AIC: 197.46
## 
## Number of Fisher Scoring iterations: 4
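
To see exactly which dummy variables the releveled factor produces (Dark as the omitted baseline, one indicator for each remaining color), the design matrix of the fitted model can be inspected; a minimal sketch (assuming M2.2 from above):

# first rows of the design matrix: intercept, width, and three color dummies (Dark is the baseline)
head(model.matrix(M2.2))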

-2logLik:

# -2logLik
-2*logLik(M2.2)
## 'log Lik.' 187.457 (df=5)

Confidence intervals for the ORs:

# OR
exp(confint.default(M2.2)) # the *.default* method is used because it assumes asymptotic normality (Wald-type intervals).
##                                                   2.5 %       97.5 %
## (Intercept)                                1.340018e-08 6.740367e-04
## width                                      1.298348e+00 1.963679e+00
## relevel(factor(color), ref = "Dark")Lt Med 7.110636e-01 2.010225e+01
## relevel(factor(color), ref = "Dark")Medium 1.387381e+00 1.190852e+01
## relevel(factor(color), ref = "Dark")Dk Med 9.471157e-01 9.646324e+00

Likelihood Ratio Test:

# 1) likelihood ratio test
#library(lmtest)
lmtest::lrtest(M2.2)

Wald Test:

# 2) Wald test
lmtest::waldtest(M2.2)

Score Test:

# 3) Score
anova(M2.2,test="Rao")
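
The ordinal model M2.1 (a linear effect of the color score) is a special case of the nominal model M2.2, so a likelihood ratio test on the deviance difference checks whether treating color as nominal improves the fit; a minimal sketch using the deviances reported above (assuming M2.1 and M2.2 are already fitted):

# LR test: linear color score (M2.1) vs. separate color dummies (M2.2)
LR <- M2.1$deviance - M2.2$deviance  # 189.12 - 187.46 = 1.66
pchisq(LR, df = 2, lower.tail = FALSE)  # little evidence against the simpler ordinal model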

Logistic Curves with Confidence Intervals (by COLOR)

Pre-plotting setup:

# Create a temporary data frame of hypothetical values
temp.data <- data.frame(width = seq(20, 34, 0.11),
                        color = c(rep("Lt Med", 128),
                                     rep("Medium", 128), 
                                     rep("Dk Med", 128),
                                     rep("Dark", 128)) ) # o R vai reciclar o tamanho da coluna width para 4x 128


# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(predict(M2.2, newdata = temp.data, 
                                        type="link", se=TRUE))

# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)

# Calculate 95% confidence limits on the link scale
std <- qnorm(0.975)  # equivalent to qnorm(0.95 / 2 + 0.5), i.e. about 1.96
new.data$ymin <- M2.2$family$linkinv(new.data$fit - std * new.data$se)
new.data$ymax <- M2.2$family$linkinv(new.data$fit + std * new.data$se)
new.data$fit <- M2.2$family$linkinv(new.data$fit)  # Rescale to 0-1

Plot:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')

# Plot everything
p3 <- ggplot(crabs2, aes(x=width, y=y))+
  geom_point() + 
  geom_ribbon(data=new.data, 
              aes(y=fit, 
                  ymin=ymin, ymax=ymax, 
                  fill=factor(color, 
                              levels= c("Lt Med", "Medium", "Dk Med", "Dark"))), alpha=0.3) + # color jah estava como factor
  geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))) + 
  facet_wrap(~factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))+
  labs(x="WIDTH", y="Probability")+
  ggtitle("Predicted Probabilities for Y=1 With 95% Confidence Limits")+
  # theme_bw()+
  # labs(color='color', fill='color') + # specify both if you want to change them
  guides(fill=guide_legend(title=NULL), color=guide_legend(title=NULL))+ # there are two overlapping legends -- one from fill=... and another from color=... -- so both must be specified to remove the title from each
  scale_fill_manual(values=plotly_pallette)+
  scale_color_manual(values=plotly_pallette)+
  theme(plot.title = element_text(hjust=0.5))
  

plotly::ggplotly(p3)

All curves in the same plot:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')

# Plot everything
p4 <- ggplot(crabs2, aes(x=width, y=y))+
  geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark")))) + 
  geom_point(data=crabs2, aes(y=y, x=width, color=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark"))))+
  labs(x="WIDTH", y="Probability")+
  ggtitle("Predicted Probabilities for Y=1")+
  theme_bw()+
  theme(panel.border = element_blank())+ # to match the plotly look
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+
  theme(plot.title = element_text(hjust=0.5))
  

plotly::ggplotly(p4)
plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')

# Plot everything
p5 <- ggplot(crabs2, aes(x=width, y=y))+
  geom_line(data=new.data, aes(y=fit, colour=factor(color, levels= c("Lt Med", "Medium", "Dk Med", "Dark"))))+ 
  labs(x="WIDTH", y="Probability")+
  ggtitle("COLOR(NOMINAL)")+
  theme_bw()+
  theme(panel.border = element_blank())+ # to match the plotly look
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+
  theme(plot.title = element_text(hjust=0.5))
  

plotly::ggplotly(p5)


DIAGNOSTIC MEASURES AND PLOTS FOR MODEL M2.2


Residuals vs. indices

model.data <- broom::augment(M2.2)

plotly_pallette3 <- c('#1F77B4', '#D62728')

p6 <- model.data %>%
  ggplot() +
  geom_point(aes(x = 1:length(.resid), y = residuals(M2.2, type = "pearson"), colour = factor(y)), size=0.9, alpha=0.7)+
  geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
  scale_color_manual(values=plotly_pallette3)+
  guides(color=guide_legend(title=NULL))+ # to remove the legend title
  theme(plot.title = element_text(hjust=0.5))+
  labs(title="Pearson Residuals", x="index", y="Pearson Residuals")



p7 <- model.data %>%
  ggplot() +
  geom_point(aes(x = 1:length(.resid), y = .resid, colour = factor(y)), size=0.9, alpha=0.7)+ # .resid from broom::augment() holds the deviance residuals
  geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
  guides(color=guide_legend(title=NULL))+ # to remove the legend title
  scale_color_manual(values=plotly_pallette3)+
  theme(plot.title = element_text(hjust=0.5))+
  labs(title="Deviance Residuals", x="index", y="Deviance Residuals")

# library(gridExtra)
# pp <- grid.arrange(p6, p7, ncol=2)

# plotly::ggplotly(plotly::subplot(p6, p7)) 
# plotly subplots cannot keep the two titles separately
plotly::ggplotly(p6)
plotly::ggplotly(p7)

Preparation for the remaining influence measures:

medinflu <-influence.measures(M2.2)

inf_measures <- tibble::as_tibble(medinflu$infmat) %>%
  magrittr::set_colnames(c("dfb.int", "dfb.width", "dfb.LtMed", "dfb.Medium", "dfb.DkMd", "dffit", "cov.r", "cook.d", "hat")) %>%
  dplyr::select(cook.d, hat, dfb.int, dfb.width, dfb.LtMed, dfb.Medium, dfb.DkMd)

inf_measures
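
A common rule of thumb -- an assumption here, not part of the original analysis -- flags observations with hat values above 2p/n or Cook's distance above 4/n; a minimal sketch applying those cutoffs to the table above, with p = 5 coefficients and n = 173 observations:

# flag potentially influential observations using rule-of-thumb cutoffs (assumed thresholds)
p <- length(coef(M2.2)); n <- nrow(crabs2)
inf_measures %>%
  dplyr::mutate(obs = dplyr::row_number()) %>%
  dplyr::filter(hat > 2 * p / n | cook.d > 4 / n)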

Remaining plots:

inf_measures2 <- inf_measures %>%
  reshape2::melt() # converting to long format so we can use facet_wrap
## No id variables; using all as measure variables
inf_measures2 <- cbind(inf_measures2, model.data[,"y", drop=FALSE], x=1:nrow(model.data))
  
# first, cook.d and the leverage (hat values)
p8 <- inf_measures2 %>%
  dplyr::filter(stringr::str_detect(variable, "cook.d|hat")) %>%
  ggplot() +
  geom_point(aes(x = x, y = value, colour = factor(y)), size=0.9, alpha=0.7)+
  geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
  guides(color=guide_legend(title=NULL))+ # to remove the legend title
  facet_wrap(~variable)+
  scale_color_manual(values=plotly_pallette3)+
  theme(plot.title = element_text(hjust=0.5))+
  labs(title="Influence Diagnostics", x="index", y="value")
  

plotly::ggplotly(p8)
# then the dfbetas, without the intercept
p9 <- inf_measures2 %>%
  dplyr::filter(!stringr::str_detect(variable, "cook.d|hat|dfb.int")) %>%
  ggplot() +
  geom_point(aes(x = x, y = value, colour = factor(y)), size=0.9, alpha=0.7)+
  geom_hline(yintercept = 0, size=0.4, alpha=0.5)+
  guides(color=guide_legend(title=NULL))+ # to remove the legend title
  facet_wrap(~variable)+
  scale_color_manual(values=plotly_pallette3)+
  theme(plot.title = element_text(hjust=0.5))+
  labs(title="Influence Diagnostics - DfBetas", x="index", y="value")
  

plotly::ggplotly(p9)


EXTRA

VARIABLE SELECTION FOR MODEL M2.2: LOGISTIC MODEL WITH WIDTH AND COLOR (NOMINAL)


Data preparation:

# we only need to treat color as a factor:
# keeping only the variables of interest: y, width, color
crabs3 <- crabs2 %>%
  dplyr::select(y, width, color)


head(crabs3)

Automatic Model Selection:

Backward selection starting from the full model y \(\sim\) width + Lt Med + Medium + Dk Med:

# preparing the data
# this is needed so that the step function recognizes the individual dummy variables
# step(M2.2) # although glm recognizes the dummies, step() treats color as a single variable with 3 d.f.

df <- as.data.frame(crabs3)
str(df)
## 'data.frame':    173 obs. of  3 variables:
##  $ y    : num  1 0 0 0 1 1 0 1 1 1 ...
##  $ width: num  28.3 24.8 26.5 25.6 28.2 27.1 24.7 25 25.8 26.8 ...
##  $ color: Factor w/ 4 levels "Lt Med","Medium",..: 2 3 1 3 2 1 4 1 4 2 ...
m <- model.matrix(~color-1, data=crabs3)
df <- cbind(crabs3[,-3], m[,-4]) # dropping Dark (the baseline)
colnames(df) <- c("y", "width", "Lt_Med", "Medium", "Dk_Med")

tibble::as_tibble(df)
# full model:
fullmod <- glm(data=df, y ~ ., family=binomial(link="logit"))

# look for the lowest AIC values

step(fullmod) # this full model is already the best one under backward selection
## Start:  AIC=197.46
## y ~ width + Lt_Med + Medium + Dk_Med
## 
##          Df Deviance    AIC
## <none>        187.46 197.46
## - Lt_Med  1   190.07 198.07
## - Dk_Med  1   191.11 199.11
## - Medium  1   194.37 202.37
## - width   1   212.06 220.06
## 
## Call:  glm(formula = y ~ width + Lt_Med + Medium + Dk_Med, family = binomial(link = "logit"), 
##     data = df)
## 
## Coefficients:
## (Intercept)        width       Lt_Med       Medium       Dk_Med  
##     -12.715        0.468        1.330        1.402        1.106  
## 
## Degrees of Freedom: 172 Total (i.e. Null);  168 Residual
## Null Deviance:       225.8 
## Residual Deviance: 187.5     AIC: 197.5

Forward selection starting from the null (intercept-only) model:

modmin <- glm(data=df, y~1, family=binomial(link="logit"))
step(modmin, direction = "forward", scope = ~ width + Lt_Med + Medium + Dk_Med) #ok!
## Start:  AIC=227.76
## y ~ 1
## 
##          Df Deviance    AIC
## + width   1   194.45 198.45
## + Medium  1   219.18 223.18
## <none>        225.76 227.76
## + Lt_Med  1   225.06 229.06
## + Dk_Med  1   225.11 229.11
## 
## Step:  AIC=198.45
## y ~ width
## 
##          Df Deviance    AIC
## + Medium  1   191.76 197.76
## <none>        194.45 198.45
## + Lt_Med  1   194.37 200.37
## + Dk_Med  1   194.45 200.45
## 
## Step:  AIC=197.76
## y ~ width + Medium
## 
##          Df Deviance    AIC
## <none>        191.76 197.76
## + Dk_Med  1   190.07 198.07
## + Lt_Med  1   191.11 199.11
## 
## Call:  glm(formula = y ~ width + Medium, family = binomial(link = "logit"), 
##     data = df)
## 
## Coefficients:
## (Intercept)        width       Medium  
##    -12.1827       0.4793       0.5763  
## 
## Degrees of Freedom: 172 Total (i.e. Null);  170 Residual
## Null Deviance:       225.8 
## Residual Deviance: 191.8     AIC: 197.8

ROC curves for the selected models:

library(pROC)
#https://stackoverflow.com/questions/27584099/plot-multiple-roc-curves-for-logistic-regression-model-in-r

y <- crabs2[["y"]]

# model 2.2 (selected by backward elimination)
preds1 <- predict(M2.2)
roc1 <- roc(y ~ preds1)

# model selected by forward selection
mod_fwd <- glm(data=df, y ~ width + Medium, family=binomial(link="logit"))
preds2 <- predict(mod_fwd)
roc2 <- roc(y ~ preds2)

  
p10 <- ggplot()+
  geom_point(aes(x=(1-roc1$specificities), y=roc1$sensitivities), colour = plotly_pallette[1])+
  geom_point(aes(x=(1-roc2$specificities), y=roc2$sensitivities), colour = plotly_pallette[3])+
  scale_x_continuous(limits = c(0,1))+
  # geom_abline(intercept = 0, slope = 1, colour = plotly_pallette[2])+
  geom_line(aes(x=c(0,1), y=c(0,1)), colour = plotly_pallette[2])+
  labs(x="1-especificidade", y="sensibilidade")+
  ggtitle("Curvas ROC")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p10)
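
To summarize the comparison in a single number, the area under each ROC curve can be computed with pROC::auc; a minimal sketch (assuming roc1 and roc2 from the chunk above):

# area under the ROC curve for each selected model
pROC::auc(roc1)  # backward-selected model (M2.2)
pROC::auc(roc2)  # forward-selected model (width + Medium)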


M3: LOGISTIC MODEL WITH WIDTH, COLOR (NOMINAL), AND INTERACTIONS


Model fit:

# again using the color 'Dark' as the baseline
M3 <- glm(data=crabs2, y ~ width*relevel(factor(color), ref='Dark'), family=binomial(link="logit"))

summary(M3)
## 
## Call:
## glm(formula = y ~ width * relevel(factor(color), ref = "Dark"), 
##     family = binomial(link = "logit"), data = crabs2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0546  -0.9130   0.5285   0.8140   1.9657  
## 
## Coefficients:
##                                                   Estimate Std. Error
## (Intercept)                                       -5.85383    6.69393
## width                                              0.20043    0.26166
## relevel(factor(color), ref = "Dark")Lt Med         4.10122   13.27532
## relevel(factor(color), ref = "Dark")Medium        -4.18613    7.58093
## relevel(factor(color), ref = "Dark")Dk Med       -15.66423    9.56065
## width:relevel(factor(color), ref = "Dark")Lt Med  -0.09443    0.50042
## width:relevel(factor(color), ref = "Dark")Medium   0.21844    0.29524
## width:relevel(factor(color), ref = "Dark")Dk Med   0.65794    0.37533
##                                                  z value Pr(>|z|)  
## (Intercept)                                       -0.874   0.3818  
## width                                              0.766   0.4437  
## relevel(factor(color), ref = "Dark")Lt Med         0.309   0.7574  
## relevel(factor(color), ref = "Dark")Medium        -0.552   0.5808  
## relevel(factor(color), ref = "Dark")Dk Med        -1.638   0.1013  
## width:relevel(factor(color), ref = "Dark")Lt Med  -0.189   0.8503  
## width:relevel(factor(color), ref = "Dark")Medium   0.740   0.4594  
## width:relevel(factor(color), ref = "Dark")Dk Med   1.753   0.0796 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 183.08  on 165  degrees of freedom
## AIC: 199.08
## 
## Number of Fisher Scoring iterations: 5

-2logLik:

# -2logLik
-2*logLik(M3)
## 'log Lik.' 183.0806 (df=8)

Confidence intervals for the ORs:

# OR
exp(confint.default(M3)) # the *.default* method is used because it assumes asymptotic normality (Wald-type intervals).
##                                                         2.5 %       97.5 %
## (Intercept)                                      5.752129e-09 1.430867e+03
## width                                            7.316783e-01 2.040661e+00
## relevel(factor(color), ref = "Dark")Lt Med       3.028073e-10 1.205339e+13
## relevel(factor(color), ref = "Dark")Medium       5.359023e-09 4.314052e+04
## relevel(factor(color), ref = "Dark")Dk Med       1.145703e-15 2.163458e+01
## width:relevel(factor(color), ref = "Dark")Lt Med 3.412170e-01 2.426326e+00
## width:relevel(factor(color), ref = "Dark")Medium 6.975291e-01 2.219082e+00
## width:relevel(factor(color), ref = "Dark")Dk Med 9.252405e-01 4.029248e+00

Likelihood Ratio Test:

# 1) likelihood ratio test
#library(lmtest)
lmtest::lrtest(M3)

Wald Test:

# 2) Wald test
lmtest::waldtest(M3)

Score Test:

# 3) Score
anova(M3,test="Rao")
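
Whether the width-by-color interaction terms are jointly needed can be checked by comparing M3 with the additive model M2.2 through a likelihood ratio test; a minimal sketch using the deviances reported above (assuming M2.2 and M3 are already fitted):

# LR test: additive model (M2.2) vs. model with interactions (M3)
LR <- M2.2$deviance - M3$deviance  # 187.46 - 183.08 = 4.38
pchisq(LR, df = 3, lower.tail = FALSE)  # the interactions are not jointly significant at the 5% level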




References


AGRESTI, ALAN. An Introduction to Categorical Data Analysis. John Wiley & Sons, second edition, 2007.

LECTURE NOTES. Análise de Dados Categorizados (Categorical Data Analysis). Undergraduate Program in Statistics, UnB, 2018.

R CORE TEAM. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.