Carregamento de pacotes:

Neste exemplo são utilizados os seguintes pacotes: dplyr, magrittr, tibble, broom, ggplot2, plotly e lmtest. Para a confecção do documento, é necessário o pacote rmarkdown.

# dplyr -- para facilitar manipulacao de dados
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')

# magrittr -- para facilitar manipulacao de dados
if (!require('magrittr')) install.packages('magrittr'); library('magrittr')

# tibble -- para facilitar manipulacao de dados
if (!require('tibble')) install.packages('tibble'); library('tibble')

# broom -- para facilitar manipulacao de dados
if (!require('broom')) install.packages('broom'); library('broom')

# ggplot2 -- para os plots
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')

# plotly -- para os plots
if (!require('plotly')) install.packages('plotly'); library('plotly')

# lmtest -- para os testes de razao de verossimilhancas e para o teste de wald
if (!require('lmtest')) install.packages('lmtest'); library('lmtest')

Carregamento dos dados:

falhas <- c(2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)

temp <- c(53, 66, 68, 70, 75, 78, 57, 67, 69, 70, 75, 79, 58, 67, 70, 72, 76, 80, 63, 67, 70, 73, 76)

challenger <- tibble::data_frame(falhas, temp)

challenger

Plot:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C')#, '#D62728')

# library(ggplot2)
p1 <- challenger %>%
  ggplot()+
  geom_point(aes(x = temp, y = falhas, colour = factor(falhas)))+
  labs(x="Temperature", y="Falhas")+
  ggtitle("Análises do Space Shuttle Data")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p1)


Análises Incorretas


Análise Incorreta 1 - Regressão Linear SEM zeros

Dados:

# removendo os zeros
challenger2 <- challenger %>%
  dplyr::filter(falhas != 0)
  
challenger2

Plot:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C')#, '#D62728')

# library(ggplot2)
p2 <- challenger2 %>%
  ggplot()+
  geom_point(aes(x = temp, y = falhas, colour = factor(falhas)))+
  labs(x="Temperature", y="Falhas")+
  ggtitle("Análises do Space Shuttle Data - Análise Incorreta 1 - Dados SEM zeros")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p2)

Modelo de Regressão Linear SEM zeros:

orings = 6
challenger2 <- challenger2 %>%
  dplyr::mutate(resp = falhas/orings)

# sum of squares
aov(resp ~ temp, data = challenger2)
## Call:
##    aov(formula = resp ~ temp, data = challenger2)
## 
## Terms:
##                       temp  Residuals
## Sum of Squares  0.00002271 0.03965983
## Deg. of Freedom          1          5
## 
## Residual standard error: 0.08906159
## Estimated effects may be unbalanced
# regressao
fit1 <- lm(resp ~ temp, data = challenger2)
summary(fit1)
## 
## Call:
## lm(formula = resp ~ temp, data = challenger2)
## 
## Residuals:
##        1        2        3        4        5        6        7 
##  0.12160 -0.04912 -0.04602 -0.04912  0.11636 -0.04626 -0.04745 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1990939  0.2859162   0.696    0.517
## temp        0.0002384  0.0044563   0.054    0.959
## 
## Residual standard error: 0.08906 on 5 degrees of freedom
## Multiple R-squared:  0.0005722,  Adjusted R-squared:  -0.1993 
## F-statistic: 0.002863 on 1 and 5 DF,  p-value: 0.9594

Plot:

plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C')#, '#D62728')

# library(ggplot2)
p3 <- challenger2 %>%
  ggplot()+
  # geom_point(aes(x = temp, y = falhas, colour = factor(falhas)))+
  geom_point(aes(x = temp, y = resp), colour = plotly_pallette[1])+
  geom_line(aes(x=temp, y = fitted(fit1)), colour = plotly_pallette[2])+
  labs(x="Temperature", y="Prob. Estimada de Falha")+
  ggtitle("NASA Space Shuttle O-Ring Failures - Erro 1")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p3)

Função para automatizar plots diagnósticos:

Neste exemplo, foi confeccionada uma função parar automatizar os plots diagnósticos para regressão linear.

RegressionPlots <- function(fit, range.plt5){
  
  model.data <- broom::augment(fit)

  Theoretical.Quantiles <- qqnorm(model.data$.resid, plot.it = F)$x

  model.data <- cbind(model.data, Theoretical.Quantiles)

  # selecionar soh os que interessam
  
  regMat <- model.data %>%
    dplyr::select(resp, .fitted, .resid, .hat, .cooksd, .std.resid, Theoretical.Quantiles ) #%>%
    # magrittr::set_colnames()
  
  plotly_pallette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
  
  plt1 <- regMat %>%
    ggplot()+
    geom_point(aes(x = .fitted, y = .resid), colour = plotly_pallette[1], alpha = 0.7)+
    geom_hline(yintercept = 0, alpha = 0.5, size=0.4)+
    ggtitle("Residuals vs Fitted Values")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
    
  
  plt2 <- regMat %>%
    ggplot()+
    geom_point(aes(x = .fitted, y = .std.resid), colour = plotly_pallette[1], alpha = 0.7)+
    geom_hline(yintercept = 0, alpha = 0.5, size=0.4)+
    ggtitle("Standardized Residuals vs Fitted Values")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
  
  
  plt3 <- regMat %>%
    ggplot()+
    geom_point(aes(x = .hat, y = .std.resid), colour = plotly_pallette[1], alpha = 0.7)+
    geom_hline(yintercept = 0, alpha = 0.5, size=0.4)+
    ggtitle("Leverage vs Standardized Residuals")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
  
  
  plt4 <- regMat %>%
    ggplot()+
    geom_point(aes(x = Theoretical.Quantiles, y = .std.resid), colour = plotly_pallette[1], alpha = 0.7)+
    geom_line(aes(x = Theoretical.Quantiles, y = Theoretical.Quantiles), colour = plotly_pallette[2])+
    geom_hline(yintercept = 0, alpha = 0.5, size=0.4)+
    geom_vline(xintercept=0, alpha = 0.5, size=0.4)+
    ggtitle("Standardized Residuals vs Theoretical Quantiles")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
  
  
  plt5 <- regMat %>%
    ggplot()+
    scale_x_continuous(limits = range.plt5)+
    geom_point(aes(x = .fitted, y = resp), colour = plotly_pallette[1], alpha = 0.7)+
    geom_abline(intercept=0, slope=1, colour = plotly_pallette[2])+
    ggtitle("Resp vs Fitted Values")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
  

  obs <- seq(1, nrow(fit$model), by =1)
  
  plt6 <- regMat %>%
    ggplot()+
    geom_point(aes(x = obs, y = .cooksd), colour = plotly_pallette[1], alpha = 0.7)+
    geom_segment(aes(x= obs, xend = obs, y = 0, yend = .cooksd), colour="grey")+
    ggtitle("Cook's distance")+
    theme_bw()+
    theme(panel.border = element_blank())+ # para ficar igual o plotly
    guides(color=guide_legend(title=NULL))+
    scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
    theme(plot.title = element_text(hjust=0.5))
  
  
  
  
  return(list(plt1, plt2, plt3, plt4, plt5, plt6))

}

Plots diagnósticos:

plt <- RegressionPlots(fit1, range.plt5 = c(0.15, 0.35))
plotly::ggplotly(plt[[1]])
plotly::ggplotly(plt[[3]])
plotly::ggplotly(plt[[5]])
plotly::ggplotly(plt[[2]])
plotly::ggplotly(plt[[4]])
plotly::ggplotly(plt[[6]])

Análise Incorreta 2 - Regressão Linear COM zeros

Modelo de Regressão Linear COM zeros:

# dados
orings = 6
challenger <- challenger %>%
  dplyr::mutate(resp = falhas/orings)

# removendo os zeros
# sum of squares
aov(resp ~ temp, data = challenger)
## Call:
##    aov(formula = resp ~ temp, data = challenger)
## 
## Terms:
##                      temp Residuals
## Sum of Squares  0.0691364 0.1941486
## Deg. of Freedom         1        21
## 
## Residual standard error: 0.09615182
## Estimated effects may be unbalanced
# regressao
fit2 <- lm(resp ~ temp, data = challenger)
summary(fit2)
## 
## Call:
## lm(formula = resp ~ temp, data = challenger)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.09348 -0.06539 -0.01323  0.01485  0.31207 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.623077   0.204982   3.040  0.00623 **
## temp        -0.008024   0.002934  -2.735  0.01242 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09615 on 21 degrees of freedom
## Multiple R-squared:  0.2626, Adjusted R-squared:  0.2275 
## F-statistic: 7.478 on 1 and 21 DF,  p-value: 0.01242

Plot:

# library(ggplot2)
p4 <- challenger %>%
  ggplot()+
  # geom_point(aes(x = temp, y = falhas, colour = factor(falhas)))+
  geom_point(aes(x = temp, y = resp), colour = plotly_pallette[1])+
  geom_line(aes(x=temp, y = fitted(fit2)), colour = plotly_pallette[2])+
  # geom_ribbon(data=new.data, aes(y=fit, ymin=ymin, ymax=ymax), alpha=0.5) + 
  geom_ribbon(data=broom::augment(fit2),
              aes(x = temp, y = .fitted, ymin = .fitted - qnorm(0.975) * .se.fit,
              ymax = .fitted + qnorm(0.975) * .se.fit), alpha = 0.5, 
              fill = '#FFF0F5')+
  labs(x="Temperature", y="Prob. Estimada de Falha")+
  ggtitle("NASA Space Shuttle O-Ring Failures - Erro")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  guides(color=guide_legend(title=NULL))+
  scale_color_manual(values=plotly_pallette)+ # NAO FUNCIONA NO GGPLOTLY
  theme(plot.title = element_text(hjust=0.5))

plotly::ggplotly(p4)

Plots diagnósticos:

plt2 <- RegressionPlots(fit2, range.plt5 = c(0, 0.35))
plotly::ggplotly(plt2[[1]])
plotly::ggplotly(plt2[[3]])
plotly::ggplotly(plt2[[5]])
plotly::ggplotly(plt2[[2]])
plotly::ggplotly(plt2[[4]])
plotly::ggplotly(plt2[[6]])


Análise Correta


Regressão Logística com zeros

Modelo de Regressão Logística (Binomial Regression):

# precisamos passar os pesos - contagens para a funcao glm!!!
# eh um caso de utilizacao da funcao de veross da binomial mesmo (k sucessos em n trials)
#...e nao da bernoulli (sucesso ou fracasso para cada trial)
# ver: https://stats.stackexchange.com/questions/90622/regression-model-where-output-is-a-probability
# e https://www.r-bloggers.com/binomial-regression-model/
# como dividimos todas as falhas por 6 -- nossos pesos sao todos 6
# ai bate exatamente com o SAS, inclusive SE e AIC

fit3 <- glm(resp ~ temp, data = challenger, weights = rep(6, nrow(challenger)),  family=binomial(link="logit"))

summary(fit3)
## 
## Call:
## glm(formula = resp ~ temp, family = binomial(link = "logit"), 
##     data = challenger, weights = rep(6, nrow(challenger)))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.95159  -0.78221  -0.54035  -0.04366   2.65354  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  5.09395    3.05524   1.667   0.0955 .
## temp        -0.11576    0.04707  -2.459   0.0139 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.230  on 22  degrees of freedom
## Residual deviance: 18.106  on 21  degrees of freedom
## AIC: 35.667
## 
## Number of Fisher Scoring iterations: 5

-2logLik:

# -2logLik
-2*logLik(fit3)
## 'log Lik.' 31.66661 (df=2)

Int. Conf. OR:

# OR
exp(confint.default(fit3)) # o metodo *.default* assume normalidade assintotica, por isso eh utilizado.
##                 2.5 %       97.5 %
## (Intercept) 0.4089341 6.499780e+04
## temp        0.8122004 9.767629e-01

Teste de Razão de Verossimilhanças:

# 1) razao de verossimilhancas
#library(lmtest)
lmtest::lrtest(fit3)

Teste de Wald:

# 2) teste de Wald
lmtest::waldtest(fit3)

Teste Score:

# 3) Score
anova(fit3,test="Rao")

Ajustes pré-plotagem:

# criando obs artificiais para suavizar a curva
x_limits <- challenger %>%
  dplyr::summarise(min = 30, max = max(temp)+2) # obtendo os limites do eixo x

x <- seq(x_limits[[1]], x_limits[[2]], by=0.5) # criando varios ptos entre esses limites

# obtendo os valores de y de acordo com os parametros da reg logist
# y <- plogis(fit3$coefficients[1]+fit3$coefficients[2]*x)


# fazendo a predicao das obs artificiais
# ver sol: https://stackoverflow.com/questions/26694931/how-to-plot-logit-and-probit-in-ggplot2
temp.data = data.frame(temp = x) # se nao passar a coluna com o mesmo nome da variavel nao funciona


# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(predict(fit3, newdata = temp.data, 
                                        type="link", se=TRUE))

# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)

# Calculate confidence intervals
std <- qnorm(0.95 / 2 + 0.5)
new.data$ymin <- fit3$family$linkinv(new.data$fit - std * new.data$se)
new.data$ymax <- fit3$family$linkinv(new.data$fit + std * new.data$se)
new.data$fit <- fit3$family$linkinv(new.data$fit)  # Rescale to 0-1

Plot:

p5 <- ggplot(challenger, aes(x=temp, y=resp)) 
p5 <- p5 + geom_point(colour = plotly_pallette[1]) + 
  geom_ribbon(data=new.data, 
              aes(y=fit, 
                  ymin=ymin, ymax=ymax), alpha = 0.5, 
              fill = '#FFF0F5')+
  geom_line(data=new.data, aes(y=fit), colour = plotly_pallette[2]) + 
  labs(x="Temperature", y="Prob. Estimada de Falha")+
  ggtitle("Predicted Probabilities for falhas/orings With 95% Confidence Limits")+
  theme_bw()+
  theme(panel.border = element_blank())+ # para ficar igual o plotly
  theme(plot.title = element_text(hjust=0.5))
  

plotly::ggplotly(p5)


Referências Bibliográficas


AGRESTI, ALAN. An Introduction to Categorical Data Analysis. John Wiley & Sons, second edition, 2007.

NOTAS DE AULA. Análise de Dados Categorizados. Curso de Graduação em Estatística, UnB, 2018.

R CORE TEAM. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.