National Park Service.  

USGS Status and Trends of Biological Resources   -   NPS Inventory and Monitoring

Learn R

R is a free software environment for statistical computing and graphics.
http://www.r-project.org/
Home | Getting Started | Schedule | References | FAQ | Discussion | Tom's site
For more information, please contact Paul Geissler (Paul_Geissler@usgs.gov).

Topic 14: Proportion Data,

Michael Crawley, 2007,The R Book, Chapter 16.
See the text for descriptions. You can copy and paste the statements below into thr R Commander script window and execute them.
Anything after # on a line is a comment. I have added annotations as comments and shown the output as comments following each command.

Use when you know how many are in contrasting categories (alive/dead, male/female, infected/uninfected). With Poisson count date, we know how many times an event occurred, but not how many times it did NOT occur.

rm(list = ls()) # removes previous variables
d=read.table("http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/sexratio.txt",header=T); attach(d); d
#   density females males # sex ratio in insects
# 1       1       1     0
# 2       4       3     1
# 3      10       7     3
# 4      22      18     4
# 5      55      22    33
# 6     121      41    80
# 7     210      52   158
# 8     444      79   365

par(mfrow=c(1,2)); p=males/(males+females)
plot(density,p,ylab="proportion male")
plot(log(density),p,ylab="proportion male"); par(mfrow=c(1,1))
R graph
y=cbind(males,females)
m1=glm(y~density,binomial); summary(m1)
# Coefficients:
#              Estimate Std. Error z value Pr(>|z|)    
# (Intercept) 0.0807368  0.1550376   0.521    0.603    
# density     0.0035101  0.0005116   6.862 6.81e-12 ***
# (Dispersion parameter for binomial family taken to be 1)
#     Null deviance: 71.159  on 7  degrees of freedom
# Residual deviance: 22.091  on 6  degrees of freedom # should equal df
# AIC: 54.618
m2=glm(y~log(density),binomial); summary(m2)
# Coefficients:
#              Estimate Std. Error z value Pr(>|z|)    
# (Intercept)  -2.65927    0.48758  -5.454 4.92e-08 ***
# log(density)  0.69410    0.09056   7.665 1.80e-14 ***
# (Dispersion parameter for binomial family taken to be 1)
#     Null deviance: 71.1593  on 7  degrees of freedom
# Residual deviance:  5.6739  on 6  degrees of freedom  # should equal df
# AIC: 38.201 # better model

x=seq(0,6,01); plot(log(density),p,ylab="proportion male"); 
lines(x,predict(m2,list(density=exp(x)),type="response"))
R graph

#### estimating LD50 and LD90 from bioassay date ########################### page 576
rm(list = ls()) # removes previous variables
d=read.table("http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/bioassay.txt",header=T); attach(d); d
#   dose dead batch # size
# 1    1    2   100
# 2    3   10    90
# 3   10   40    98
# 4   30   96   100
# 5  100   98   100
y=cbind(dead,batch-dead); m=glm(y~log(dose),binomial); dose.p(m,p=c(0.5,0.9,0.95))
#               Dose         SE
# p = 0.50: 2.306981 0.07772065 # LD50 lethal dose 50% of population
# p = 0.90: 3.425506 0.12362080 # LD90
# p = 0.95: 3.805885 0.15150043 # LD95

#### proportion data with categorical explanatory variables ##################  page 577
rm(list = ls()) # removes previous variables
d=read.table("http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/germination.txt",header=T); attach(d); d
# Germanation of seeds of two genotypes of the parasitic plant Orobanche and two extracts from 
# host plants (bean and cucomber) that were used to stimulate germination.  Two-way factorial analysis of deviance.
#    count sample Orobanche  extract 
# 1     10     39       a75     bean
# . . .
y=cbind(count,sample-count); m1=glm(y~Orobanche*extract,binomial); summary(m1)
# Residual deviance: 33.278  on 17  degrees of freedom # overdispersion - they should be equal
m1=glm(y~Orobanche*extract,quasibinomial); summary(m1)
# Coefficients:
#                                      Estimate Std. Error t value Pr(>|t|)  
# (Intercept)                           -0.4122     0.2513  -1.640   0.1193  
# Orobanche[T.a75]                      -0.1459     0.3045  -0.479   0.6379  
# extract[T.cucumber]                    0.5401     0.3409   1.584   0.1315  
# Orobanche[T.a75]:extract[T.cucumber]   0.7781     0.4181   1.861   0.0801 .
# (Dispersion parameter for quasibinomial family taken to be 1.861832)
#     Null deviance: 98.719  on 20  degrees of freedom
# Residual deviance: 33.278  on 17  degrees of freedom # should equal df, use quasibinomial
m2=glm(y~Orobanche+extract,quasibinomial); summary(m2)
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)          -0.7005     0.2199  -3.186  0.00512 ** 
# Orobanche[T.a75]      0.2705     0.2257   1.198  0.24635    
# extract[T.cucumber]   1.0647     0.2104   5.061 8.14e-05 ***
# (Dispersion parameter for quasibinomial family taken to be 2.128368)
#     Null deviance: 98.719  on 20  degrees of freedom
# Residual deviance: 39.686  on 18  degrees of freedom
anova(m1,m2,test="F") # use F test with quasibinomial
# Analysis of Deviance Table
# Model 1: y ~ Orobanche * extract
# Model 2: y ~ Orobanche + extract
#   Resid. Df Resid. Dev Df Deviance      F  Pr(>F)  
# 1        17     33.278                             
# 2        18     39.686 -1   -6.408 3.4418 0.08099 . # interaction is not significant
# No evidence of different responses to extracts.
anova(m2,test="F")
# Analysis of Deviance Table
# Terms added sequentially (first to last)
#           Df Deviance Resid. Df Resid. Dev       F    Pr(>F)    
# NULL                         20     98.719                      
# Orobanche  1    2.544        19     96.175  1.1954    0.2887    
# extract    1   56.489        18     39.686 26.5412 6.692e-05 ***
m3=glm(y~extract,quasibinomial); anova(m2,m3,test="F")
# Analysis of Deviance Table
# Model 1: y ~ Orobanche + extract
# Model 2: y ~ extract
#   Resid. Df Resid. Dev Df Deviance      F Pr(>F)
# 1        18     39.686                          
# 2        19     42.751 -1   -3.065 1.4401 0.2457
summary(m3) # final model
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)          -0.5122     0.1531  -3.345   0.0034 **   # estimates on transformed scale
# extract[T.cucumber]   1.0574     0.2118   4.992  8.1e-05 ***
# (Dispersion parameter for quasibinomial family taken to be 2.169821)
#     Null deviance: 98.719  on 20  degrees of freedom
# Residual deviance: 42.751  on 19  degrees of freedom
tapply(predict(m3,type="response"),extract,mean)
#      bean  cucumber # predicted proportions germinate
# 0.3746835 0.6330275 
tapply(count,extract,sum)/tapply(sample,extract,sum)
     bean  cucumber  # observed proportions (divide totals)
0.3746835 0.6330275 

#### analysis of covariance with binomial data ##########################  page 581
# Flowering in 5 varities of perennial plats, sprayed with growth promoters.
rm(list = ls()) # removes previous variables
d=read.table("http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/flowering.txt",header=T); attach(d); d
#    flowered number dose variety
# 1         0     12    1       A
# . . .
y=cbind(flowered, number-flowered); pf=flowered/number; scatterplot(pf~dose|variety)
R graph
m=glm(y~dose*variety,binomial); summary(m)
# Coefficients:
#                   Estimate Std. Error z value Pr(>|z|)    
# (Intercept)       -4.59165    1.03215  -4.449 8.64e-06 ***
# dose               0.41262    0.10033   4.113 3.91e-05 ***
# variety[T.B]       3.06197    1.09317   2.801 0.005094 ** 
# variety[T.C]       1.23248    1.18812   1.037 0.299576    
# variety[T.D]       3.17506    1.07516   2.953 0.003146 ** 
# variety[T.E]      -0.71466    1.54849  -0.462 0.644426    
# dose:variety[T.B] -0.34282    0.10239  -3.348 0.000813 ***
# dose:variety[T.C] -0.23039    0.10698  -2.154 0.031274 *  
# dose:variety[T.D] -0.30481    0.10257  -2.972 0.002961 ** 
# dose:variety[T.E] -0.00649    0.13292  -0.049 0.961057    
# (Dispersion parameter for binomial family taken to be 1)
#     Null deviance: 303.350  on 29  degrees of freedom
# Residual deviance:  51.083  on 20  degrees of freedom # overdispersion may be due to lack of fit.

## run a model for each variety separately
plt=function(v){
   ty=y[variety==v,]; td=dose[variety==v]; mt=glm(ty~td,binomial)  
   pd=seq(0,32,0.2); pp=predict(mt,list(td=pd),type="response")
   p=ty[,1]/(ty[,1]+ty[,2]); plot(p~td); lines(pd,pp)
   }
par(mfrow=c(2,3)); plt("A");plt("B"); plt("C"); plt("D"); plt("E"); par(mfrow=c(1,1))
R graph
## Some varieties do not fit the logistic model and need a "hump". Add I(td^2)
plt=function(v){
   ty=y[variety==v,]; td=dose[variety==v]; mt=glm(ty~td+I(td^2),binomial)  
   pd=seq(0,32,0.2); pp=predict(mt,list(td=pd),type="response")
   p=ty[,1]/(ty[,1]+ty[,2]); plot(p~td); lines(pd,pp)
   }
par(mfrow=c(2,3)); plt("A");plt("B"); plt("C"); plt("D"); plt("E"); par(mfrow=c(1,1))
R graph

m=glm(y~dose*variety+I(dose^2),binomial); summary(m)
# Residual deviance:  33.641  on 19  degrees of freedom # overdispersion
m1=glm(y~dose*variety+I(dose^2),quasibinomial); summary(m1)
# Coefficients:
#                    Estimate Std. Error t value Pr(>|t|)   
# (Intercept)       -5.081310   1.524669  -3.333  0.00350 **
# dose               0.539558   0.149004   3.621  0.00182 **
# variety[T.B]       2.787401   1.616284   1.725  0.10084   
# variety[T.C]       0.345351   1.837732   0.188  0.85293   
# variety[T.D]       2.761404   1.588723   1.738  0.09837 . 
# variety[T.E]      -0.949808   2.233017  -0.425  0.67536   
# I(dose^2)         -0.006167   0.002211  -2.789  0.01171 * 
# dose:variety[T.B] -0.259977   0.146017  -1.780  0.09100 . 
# dose:variety[T.C] -0.141160   0.153381  -0.920  0.36894   
# dose:variety[T.D] -0.228755   0.145574  -1.571  0.13259   
# dose:variety[T.E]  0.012697   0.184766   0.069  0.94593   
## We can now proceed with model simplification.

#### converting complex contegency tables to proportions ############################## page 584
rm(list = ls()) # removes previous variables
d=read.table("http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/lizards.txt",header=T); attach(d); d
#     n   sun height  perch      time  species # Schoener's lizards 
# 1  20 Shade   High  Broad   Morning opalinus
# . . .
ds=d[order(species,sun,height,perch,time),];  attach(ds); # must be in order to use cbind
g=ds[species=="grahamii",]; o=ds[species=="opalinus",]
liz=data.frame(sun=g$sun,height=g$height,perch=g$perch,time=g$time,ag=g$n,ao=o$n); attach(liz); liz
#      sum height  perch      time ag ao
# 1  Shade   High  Broad Afternoon  4  4
# . . .
y=cbind(ag,ao); m1=glm(y~sun*height*perch*time,binomial); m2=step(m1); summary(m2)
# Start:  AIC=102.82
#                         Df  Deviance    AIC
# - sun:height:perch:time  1 2.180e-10 100.82
# <none>                     3.582e-10 102.82
# Step:  AIC=100.82
#                     Df Deviance     AIC
# - sun:height:time    2    0.442  97.266
# - sun:perch:time     2    0.810  97.634
# - height:perch:time  2    3.222 100.046
# <none>                 2.18e-10 100.824
# - sun:height:perch   1    2.709 101.533
# Step:  AIC=97.27
#                     Df Deviance    AIC
# - sun:perch:time     2    1.071 93.896
# <none>                    0.442 97.266
# - height:perch:time  2    4.648 97.472
# - sun:height:perch   1    3.111 97.936
# Step:  AIC=93.9
#                     Df Deviance    AIC
# - sun:time           2    3.340 92.165
# <none>                    1.071 93.896
# - sun:height:perch   1    3.302 94.126
# - height:perch:time  2    5.791 94.615
# Step:  AIC=92.16
#                     Df Deviance    AIC
# <none>                    3.340 92.165
# - sun:height:perch   1    5.827 92.651
# - height:perch:time  2    8.542 93.366
# Call:
# glm(formula = y ~ sun + height + perch + time + sun:height + 
#     sun:perch + height:perch + height:time + perch:time + sun:height:perch + 
#     height:perch:time, family = binomial)
# Coefficients:
#                                                Estimate Std. Error z value Pr(>|z|)  
# (Intercept)                                     -0.8297     0.5120  -1.620   0.1051  
# sun[T.Sun]                                       0.4791     0.4744   1.010   0.3126  
# height[T.Low]                                  -20.9069  6551.9690  -0.003   0.9975  
# perch[T.Narrow]                                  0.1751     0.7481   0.234   0.8149  
# time[T.Mid.day]                                 -0.9101     0.4272  -2.130   0.0331 *
# time[T.Morning]                                 -0.9314     0.4637  -2.009   0.0445 *
# sun[T.Sun]:height[T.Low]                        19.7912  6551.9690   0.003   0.9976  
# sun[T.Sun]:perch[T.Narrow]                       0.2416     0.6987   0.346   0.7295  
# height[T.Low]:perch[T.Narrow]                   21.1239  6551.9691   0.003   0.9974  
# height[T.Low]:time[T.Mid.day]                   -0.2446     0.9279  -0.264   0.7920  
# height[T.Low]:time[T.Morning]                    0.5732     0.9260   0.619   0.5359  
# perch[T.Narrow]:time[T.Mid.day]                  0.2140     0.6486   0.330   0.7415  
# perch[T.Narrow]:time[T.Morning]                  0.7106     0.6980   1.018   0.3087  
# sun[T.Sun]:height[T.Low]:perch[T.Narrow]       -19.9665  6551.9691  -0.003   0.9976  
# height[T.Low]:perch[T.Narrow]:time[T.Mid.day]   -0.6021     1.3489  -0.446   0.6553  
# height[T.Low]:perch[T.Narrow]:time[T.Morning]   -3.2054     1.6106  -1.990   0.0466 *
# (Dispersion parameter for binomial family taken to be 1)
#     Null deviance: 70.1018  on 22  degrees of freedom
# Residual deviance:  3.3403  on  7  degrees of freedom  # Crawley does not use quasibinomial
# AIC: 92.165
m3=step(m1,k=log(length(sun))); summary(m3) # using BIC does not select an many parameters
# Start:  AIC=131.1
#                         Df  Deviance    AIC
# - sun:height:perch:time  1 2.180e-10 127.92
# <none>                     3.582e-10 131.10
# Step:  AIC=127.92
#                     Df Deviance     AIC
# - sun:height:time    2    0.442 122.005
# - sun:perch:time     2    0.810 122.373
# - height:perch:time  2    3.222 124.785
# - sun:height:perch   1    2.709 127.450
# <none>                 2.18e-10 127.919
# Step:  AIC=122
#                     Df Deviance     AIC
# - sun:perch:time     2    1.071 116.279
# - height:perch:time  2    4.648 119.855
# - sun:height:perch   1    3.111 121.497
# <none>                    0.442 122.005
# Step:  AIC=116.28
#                     Df Deviance     AIC
# - sun:time           2    3.340 112.191
# - height:perch:time  2    5.791 114.642
# - sun:height:perch   1    3.302 115.331
# <none>                    1.071 116.279
# Step:  AIC=112.19
#                     Df Deviance     AIC
# - height:perch:time  2    8.542 111.037
# - sun:height:perch   1    5.827 111.500
# <none>                    3.340 112.191
# Step:  AIC=111.04
#                    Df Deviance     AIC
# - perch:time        2    8.551 104.690
# - height:time       2    9.483 105.622
# - sun:height:perch  1   10.903 110.220
# <none>gt;                   8.542 111.037
# Step:  AIC=104.69
#                    Df Deviance     AIC
# - height:time       2    9.491  99.274
# - sun:height:perch  1   10.909 103.870
# <none>                   8.551 104.690
# Step:  AIC=99.27
#                    Df Deviance     AIC
# - sun:height:perch  1   11.767  98.371
# <none>                   9.491  99.274
# - time              2   21.278 104.704
# Step:  AIC=98.37
#                Df Deviance     AIC
# - sun:perch     1   11.783  95.210
# - height:perch  1   11.979  95.406
# - sun:height    1   13.848  97.275
# <none>              11.767  98.371
# - time          2   23.245 103.493
# Step:  AIC=95.21
#                Df Deviance     AIC
# - height:perch  1   11.984  92.233
# - sun:height    1   13.923  94.172
# <none>              11.783  95.210
# - time          2   23.385 100.456
# Step:  AIC=92.23
#              Df Deviance     AIC
# - sun:height  1   14.205  91.275
# <none>            11.984  92.233
# - time        2   23.714  97.606
# - perch       1   24.921 101.991
# Step:  AIC=91.28
#          Df Deviance     AIC
# <none>        14.205  91.275
# - sun     1   21.892  95.784
# - time    2   25.802  96.516
# - perch   1   27.335 101.227
# - height  1   36.271 110.163
# Call:
# glm(formula = y ~ sun + height + perch + time, family = binomial)
# Coefficients:
#                 Estimate Std. Error z value Pr(>|z|)    
# (Intercept)      -1.2079     0.3536  -3.416 0.000634 ***
# sun[T.Sun]        0.8473     0.3224   2.628 0.008585 ** 
# height[T.Low]    -1.1300     0.2571  -4.395 1.11e-05 ***
# perch[T.Narrow]   0.7626     0.2113   3.610 0.000306 ***
# time[T.Mid.day]  -0.9639     0.2816  -3.423 0.000619 ***
# time[T.Morning]  -0.7368     0.2990  -2.464 0.013730 *  
# (Dispersion parameter for binomial family taken to be 1)
#     Null deviance: 70.102  on 22  degrees of freedom
# Residual deviance: 14.205  on 17  degrees of freedom
# AIC: 83.029
anova(m2,test="Chisq")
# Analysis of Deviance Table
# Model: binomial, link: logit
# Response: y
# Terms added sequentially (first to last)
#                   Df Deviance Resid. Df Resid. Dev P(>|Chi|)
# NULL                                 22     70.102          
# sun                1    6.472        21     63.629     0.011
# height             1   26.134        20     37.495 3.185e-07
# perch              1   11.694        19     25.802     0.001
# time               2   11.597        17     14.205     0.003
# sun:height         1    2.220        16     11.984     0.136
# sun:perch          1    0.005        15     11.979     0.941
# height:perch       1    0.212        14     11.767     0.645
# height:time        2    0.858        12     10.909     0.651
# perch:time         2    0.006        10     10.903     0.997
# sun:height:perch   1    2.361         9      8.542     0.124
# height:perch:time  2    5.201         7      3.340     0.074
anova(m3,test="Chisq")
# Analysis of Deviance Table
# Model: binomial, link: logit
# Response: y
# Terms added sequentially (first to last)
#        Df Deviance Resid. Df Resid. Dev P(>|Chi|) # AIC and BIC give very different models
# NULL                      22     70.102          
# sun     1    6.472        21     63.629     0.011
# height  1   26.134        20     37.495 3.185e-07
# perch   1   11.694        19     25.802     0.001
# time    2   11.597        17     14.205     0.003

Thanks to USGS Fort Collins Science Center for hosting this page for the USGS Biology Science Staff.

Accessibility FOIA Privacy Policies and Notices

Take Pride in America logo USA.gov logo U.S. Department of the Interior | U.S. Geological Survey
URL: <%Response.Write(Request.Url)%>   Page Contact Information: Paul_Geissler@usgs.gov