Lobry & Chessel (2003) JAG 44:235

This page allows for the on-line reproduction of the figures in the paper: Lobry, J.R., Chessel, D. (2003) Internal correspondence analysis of codon and amino-acid usage in thermophilic bacteria. Journal of Applied Genetics, 44:235-261. [DATASET] [PDF]

Abstract: Starting from two datasets of codon usage in coding sequences from mesophilic and thermophilic bacteria we have used internal correspondence analysis to study the variability of codon usage within and between species, and within and between amino-acids. The first dataset included 18,958,458 codons from 58,482 coding sequences from 25 completely sequenced genomes along with 6,793,581 dinucleotides from 21,876 intergenic spaces. A second dataset with partially sequenced genomes included 97,095,873 codons from 293 bacterial species. Results were consistent between the two datasets. The trend for the amino-acid composition of thermophilic proteins was found to be under the control of a pressure at the nucleic acid level, not a selection at the protein level. This effect was no more present in intergenic spaces ruling out a pressure at the DNA level. The pattern at the mRNA level was more complex than a simple purine enrichment of sense strand of coding sequences. Outliers in the partial genome dataset introduced a note of caution about the interpretation of temperature as the direct determinant of the trend observed in thermophiles. The surprising lack of selection on the amino acid content in thermophilic proteins suggested that the amino acid repertoire was set up in a hot environment.

#
# Insert 1 (Temperature and growth)
#
# Read the growth data (first line of the file holds the column names) and
# plot the resulting data frame as filled points.
barber <- read.table("http://pbil.univ-lyon1.fr/R/donnees/barber.txt",
                     header = TRUE)
plot(
  barber,
  pch = 19,
  cex = 0.8,
  xlab = "Temperature [°C]",
  ylab = "Specific growth rate [1/h]",
  main = "Influence of temperature on E. coli growth"
)

#
# Define CTMI model:
#
# CTMI(T, param) returns the growth rate predicted at temperature(s) T.
#   T     : numeric vector of temperatures (a single value works as before).
#   param : numeric vector read in the order c(Tmin, Topt, Tmax, Muopt).
# Value  : numeric vector, same length as T; 0 wherever T <= Tmin or
#          T >= Tmax, and Muopt at T == Topt (numerator and denominator are
#          then equal).
# The range test is applied element-wise, so a whole vector of temperatures
# can be passed in one call instead of tripping the scalar-only `if`.
CTMI <- function(T, param)
{
  Tmin <- param[[1]]
  Topt <- param[[2]]
  Tmax <- param[[3]]
  Muopt <- param[[4]]

  Num <- (T-Tmax)*(T-Tmin)^2
  Den <- (Topt-Tmin)*((Topt-Tmin)*(T-Topt)-(Topt-Tmax)*(Topt+Tmin-2*T))
  mu <- Muopt*Num/Den

  # No growth outside the open interval (Tmin, Tmax).
  mu[T <= Tmin | T >= Tmax] <- 0
  return(mu)
}

#
# Define Sum of Squared Residuals for CTMI model:
#
# sceCTMI(param, data) returns sum((observed - predicted)^2).
#   param : parameter vector handed unchanged to CTMI().
#   data  : table whose 1st column holds the x values given to CTMI() and
#           whose 2nd column holds the observed responses.
# vapply() guarantees a numeric vector of predictions whatever the number of
# rows; with sapply() an empty table produced a list and the subtraction
# failed. An empty table now yields a sum of 0.
sceCTMI <- function(param, data)
{
  xobs <- data[,1]
  yobs <- data[,2]
  ytheo <- vapply(xobs, CTMI, numeric(1), param = param)
  return( sum((yobs-ytheo)^2) )
}

#
# Estimate parameter values:
#
# Starting point for the minimiser, in the order CTMI() reads its parameters
# (Tmin, Topt, Tmax, Muopt); nlm() then minimises the sum of squared
# residuals over the observations in `barber`.
guess <- c(10, 40, 50, 2.5)
est <- nlm(f = sceCTMI, p = guess, data = barber)$estimate

# Fitted curve evaluated every degree across the observed temperature range.
xaxis <- seq(from = min(barber$Temp), to = max(barber$Temp), by = 1)
ytheo <- vapply(xaxis, CTMI, numeric(1), param = est)

#
# Plot best fit:
#
lines(x = xaxis, y = ytheo)

#
# Insert 2 (surjective genetic codes)
#
# Draws three concentric discs, writes the codons on the outer ring and the
# amino acids (three-letter, then one-letter codes) in the centre, and joins
# every codon to the amino acid it is translated into.
#
# words(), s2c(), translate() and aaa() come from the seqinr package, which
# therefore has to be attached before this section runs.
library(seqinr)

numcode <- 1 # To choose the genetic code

#
# General layout
#
symbols(x = rep(0, 3), y = rep(0, 3),
        circles = c(1, 0.75, 0.45),
        inches = FALSE,
        bg = c("pink", "white", "lightblue"),
        xlim = c(-1, 1),
        ylim = c(-1, 1),
        bty = "n",
        asp = 1,
        main = paste("The surjective nature of genetic codes\nGenetic code number", numcode),
        xlab = "", ylab = "",
        xaxt = "n", yaxt = "n")
title(sub = "Adapted from insert 2 in Lobry & Chessel (2003) JAG 44:235")

#
# Codons, their one-letter translation and the three-letter equivalent
#
codons <- words()
aa <- unlist(lapply(codons,
                    function(cdn) translate(s2c(cdn), numcode = numcode)))
aa3 <- aaa(aa)

#
# sort by alphabetical order of three-letter code of amino-acids
#
neworder <- order(aa3)
aa3 <- aa3[neworder]
aa <- aa[neworder]
codons <- codons[neworder]

# Distinct amino-acid labels, in plotting order. aa and aa3 were reordered
# together, so both sets of labels line up position by position.
aa3_labels <- unique(aa3)
aa_labels <- unique(aa)

#
# Text for codons
#
# One angle per codon, evenly spread over the circle; the closing angle 2*pi
# duplicates 0 and is dropped.
cangles <- seq(0, 2*pi, length.out = length(codons) + 1)[seq_along(codons)]
text(x = sin(cangles)*0.9, y = cos(cangles)*0.9, labels = codons, cex = 0.65)

#
# Text for aa3
#
# The number of angles follows the number of distinct amino-acid labels
# instead of assuming 21 of them.
aangles <- seq(0, 2*pi, length.out = length(aa3_labels) + 1)[seq_along(aa3_labels)]
text(x = sin(aangles)*0.35, y = cos(aangles)*0.35,
     labels = aa3_labels, cex = 0.8)

#
# Text for aa
#
text(x = sin(aangles)*0.25, y = cos(aangles)*0.25,
     labels = aa_labels, cex = 0.8)

#
# Draw lines
#
# aa3[i] already is the three-letter translation of codons[i], so the target
# of each line is found with one match() rather than re-translating every
# codon; segments() then draws all codon -> amino-acid lines in one call.
n <- match(aa3, aa3_labels)
segments(x0 = sin(cangles)*0.85, y0 = cos(cangles)*0.85,
         x1 = sin(aangles[n])*0.4, y1 = cos(aangles[n])*0.4)

Figure 1. Eigenvalue graphs for internal correspondence analysis and associated analyses of codon usage in 58,482 bacterial coding sequences from 25 complete genomes. The eigenvalue for a given factor is proportional to the variance in the table under analysis that is accounted for by that factor. Conceptually assigned factors were colorized as follows: black for the G+C content effect, dark gray for temperature effect, and light gray for the subcellular location effect. The relative contribution of one factor within an analysis is also indicated as a percentage for assigned factors. All the graphs are at the same scale (only the first 10 eigenvalues were represented) to allow for a direct visual comparison.

#
# Figure 1 (Eigenvalue graphs)
#
# Nine bar plots in a 3 x 3 layout, each showing the first 10 values of one
# eigenvalue vector, all on the same vertical scale (0 to max(frasort.coa)).
# Selected bars are filled and annotated with their share of the total.
#
ftpPath <- "/ftp/ftpdir/pub/datasets/JAG2003"
load( paste(ftpPath, "eigens.RData", sep = "/" ))

op <- par( no.readonly = TRUE )
# Kept as separate calls, in this order, as in the original script (cex is
# set after the layout).
par(mfrow=c(3,3))
par(cex=1.0)
par(cex.main=1.0)
par(mai=c(0.03,0.03,0.03,0.03))
par(bg="white")

# The eigenvalue vectors are looked up inside `eigens` with with() instead of
# attach(): the attached copy was never detached, and attached objects are
# searched *after* the global environment, so a global object of the same
# name (for instance the `wb` loaded for Figure 4) would silently be used in
# their place.
with(eigens, {
  ymax <- max(frasort.coa)

  # Bar plot of the first 10 eigenvalues; `fills` colours the leading bars,
  # the remaining ones are white. Returns the bar mid-points.
  bars <- function(ev, fills, ...) {
    barplot(ev[1:10],
            col = c(fills, rep("white", length(ev))),
            yaxt = "n",
            ylim = c(0, ymax),
            ...)
  }
  # Label giving the share (in %) of the k-th eigenvalue in the total.
  pct <- function(ev, k, what) {
    paste(round(100*ev[k]/sum(ev), 1), what)
  }
  # Annotation placed to the right of (x, y), allowed to leave the plot region.
  note <- function(x, y, label) {
    text(x, y = y, label, xpd = TRUE, pos = 4)
  }
  # Bold panel caption at mid-height, centred on the 5th bar.
  caption <- function(bp, label) {
    text(bp[5], y = ymax/2, label, font = 2)
  }

  # Row 1: within species
  bp <- bars(ww, "black", bty = "o")
  caption(bp, "WITHIN AA\nWITHIN SPECIES")
  note(0.5*bp[1], 2.0*ww[1], pct(ww, 1, "% (G+C)"))
  box(bty = "o")

  bp <- bars(wb, "lightgrey")
  note(0.5*bp[1], 2*wb[1], pct(wb, 1, "% (subcellular location)"))
  caption(bp, "BETWEEN AA\nWITHIN SPECIES")
  box(bty = "o")

  bp <- bars(wt, c("lightgrey", "black"))
  note(0.5*bp[1], 2.5*wt[1], pct(wt, 1, "% (subcellular location)"))
  note(bp[2], 2*wt[2], pct(wt, 2, "% (G+C)"))
  caption(bp, "\nWITHIN SPECIES")
  box()

  # Row 2: between species
  bp <- bars(bw, c("black", "darkgrey"))
  note(0.5*bp[1], 1.1*bw[1], pct(bw, 1, "% (G+C)"))
  note(bp[2], 1.5*bw[2], pct(bw, 2, "% (Temperature)"))
  caption(bp, "WITHIN AA\nBETWEEN SPECIES")
  box()

  bp <- bars(bb.coa, c("black", "darkgrey"))
  note(0.5*bp[1], 1.2*bb.coa[1], pct(bb.coa, 1, "% (G+C)"))
  note(bp[2], 3*bb.coa[2], pct(bb.coa, 2, "% (Temperature)"))
  caption(bp, "BETWEEN AA\nBETWEEN SPECIES")
  box()

  bp <- bars(bt.coa, c("black", "darkgrey"))
  note(1.5*bp[1], 0.9*bt.coa[1], pct(bt.coa, 1, "% (G+C)"))
  # Both annotations of this panel start from the first bar.
  note(1.5*bp[1], 1.2*bt.coa[2], pct(bt.coa, 2, "% (Temperature)"))
  caption(bp, "\nBETWEEN SPECIES")
  box()

  # Row 3: totals
  bp <- bars(tw, c("black", "darkgrey"))
  note(1.5*bp[1], tw[1], pct(tw, 1, "% (G+C)"))
  note(1.5*bp[2], 0.95*tw[2], pct(tw, 2, "% (Temperature)"))
  caption(bp, "WITHIN AA")
  box()

  bp <- bars(tb.coa, c("black", "lightgrey"))
  note(0.5*bp[1], 1.2*tb.coa[1], pct(tb.coa, 1, "% (G+C)"))
  note(0.7*bp[2], 1.5*tb.coa[2], pct(tb.coa, 2, "% (subcellular location)"))
  caption(bp, "BETWEEN AA")
  box()

  bp <- bars(frasort.coa, c("black", "darkgrey"), space = 0.2)
  note(2.0*bp[1], 0.9*frasort.coa[1], pct(frasort.coa, 1, "% (G+C)"))
  note(1.2*bp[2], frasort.coa[2], pct(frasort.coa, 2, "% (Temperature)"))
  caption(bp, "GLOBAL")
  box()

  invisible(NULL)
})

# Put the graphical parameters back as they were before this figure.
par(op)

Figure 2. First factorial map for global analysis of codon usage in 58,482 coding sequences. Points corresponding to coding sequences are too numerous to plot, they have been summarized on a species basis by ellipses containing 95% of them. Low G+C species are on the left, high G+C on the right. Thermophilic species are on the top, mesophilic on the bottom. Species names were abbreviated as follows: Aeropyrum pernix (AERP), Aquifex aeolicus (AQUA), Archaeoglobus fulgidus (ARCF), Bacillus halodurans (BACH), Campylobacter jejuni (CAMJ), Deinococcus radiodurans (DEIR), Escherichia coli K12 (ESCK), Halobacterium sp. NRC-1 (HASN), Methanococcus jannaschii (METJ), Methanopyrus kandleri AV19 (MEKA), Methanothermobacter thermoautotrophicus (METT), Pseudomonas aeruginosa (PSEA), Pyrobaculum aerophilum (PYROBA), Pyrococcus abyssi (PYROCA), Pyrococcus furiosus DSM 3638 (PFD3), Pyrococcus horikoshii (PYRH), Rickettsia prowazekii (RICP), Staphylococcus aureus subsp. aureus MU50 (SASAM), Streptomyces coelicolor A3(2) (STCA), Sulfolobus solfataricus (SULS), Sulfolobus tokodaii (SULT), Thermoanaerobacter tengcongensis (THET), Thermoplasma acidophilum (THEA), Thermoplasma volcanium (THEV), Thermotoga maritima (THEM).

Figure 3. Codon positions on the first factorial map for global analysis of codon usage in 58,482 coding sequences. AT rich codons are on the left, GC rich codons on the right. Codons preferred in thermophilic bacteria are on the top, codons avoided in thermophilic bacteria on the bottom.

Figure 4. Bimodal distribution of 58,482 protein scores on the first factor of between-amino acid and within-species variability. The line represents the maximum-likelihood estimate for a mixture of two normal distributions.

#
# Figure 4: bimodal distribution
#
# Fits a mixture of two normal distributions to the protein scores by
# minimising the negative log-likelihood, then draws the histogram of the
# scores with the two weighted component densities and their parameters.
#
ftpPath <- "/ftp/ftpdir/pub/datasets/JAG2003"
load( paste(ftpPath, "wb.RData", sep = "/" ))

# Scores on the first factor, sign reversed. Defined before the likelihood
# function that reads it, and reused for the histogram below.
z <- -1*wb$li[,1]

# Negative log-likelihood of the two-component normal mixture for z.
#   x : c(weight of component 1, mean 1, sd 1, mean 2, sd 2);
#       component 2 gets weight 1 - x[1].
# z is taken from the enclosing (global) environment.
logvraineg <- function(x)
{
  -sum( log( x[1]*dnorm(z,x[2],x[3])+(1-x[1])*dnorm(z,x[4],x[5])))
}

guess <- c(0.13, -0.3, 0.08, 0.05, 0.09)

nlm0 <- nlm(logvraineg, guess)

# probability = TRUE is spelled out in full: `T` is an ordinary variable that
# can be reassigned, and abbreviated argument names rely on partial matching.
hist(z, breaks = 100, col = "grey",
    main="Distribution of protein scores on the first factor of Between-\naminoacids Within-species variability", ylab="protein frequency",
    xlab = "Coordinate on first factor", probability = TRUE)

# Weighted density of each component over the observed range of scores.
w0 <- seq(min(z), max(z), length.out = 100)
q <- nlm0$estimate
lines(w0, q[1]*dnorm(w0,q[2],q[3]))
lines(w0, (1-q[1])*dnorm(w0,q[4],q[5]))

text(-0.75, 3.8, paste("n =", nrow(wb$tab), "proteins"), pos = 4)

# Left-hand annotations: first component (weight q[1]).
text(-0.75, 1.7, pos = 4,
     paste0("Integral Membrane Proteins\n", round(100*q[1],2), "%"))
text(-0.75, 1.48, pos = 4,
     bquote(mu == .(round(q[2],3))))
text(-0.75, 1.36, pos = 4,
     paste0("s = ", round(q[3],3)))

# Right-hand annotations: second component (weight 1 - q[1]).
text(0.25, 1.7, pos = 4,
     paste0("Cytoplasmic Proteins\n", round(100*(1-q[1]),2), "%"))
text(0.25, 1.48, pos = 4,
     bquote(mu == .(round(q[4],3))))
text(0.25, 1.36, pos = 4,
     paste0("s = ", round(q[5],3)))

Figure 5a. Comparison of synonymous (x-axis) and non-synonymous (y-axis) codon usage in 25 bacterial species. Each species is represented by a point with coordinates on the first factor of codon usage analyses. If there was no selection on the average amino-acid content of proteins, points should lie on the y = x line; if there was an absolute selection on the average amino-acid content, points should lie on the horizontal axis. Observed slope values (from orthogonal regression) are intermediate between these two extreme theoretical situations. Note that since the first factor of variability is the G+C content, this is very similar to Sueoka's neutrality plot: the synonymous coordinate for G+C is close to P3 and the non-synonymous coordinate to P12 (cf. Sueoka 1988).

Figure 5b. Comparison of synonymous (x-axis) and non-synonymous (y-axis) codon usage in 25 bacterial species. Each species is represented by a point with coordinates on the second factor of codon usage analyses. If there was no selection on the average amino-acid content of proteins, points should lie on the y = x line; if there was an absolute selection on the average amino-acid content, points should lie on the horizontal axis. Observed slope values (from orthogonal regression) are intermediate between these two extreme theoretical situations.

Figure 6. First factorial map for synonymous codon usage in 293 bacteria. Filled points correspond to thermophilic species and open points to mesophilic species. The first factor is the G+C content with low G+C species on the left and high G+C species on the right, the second factor is linked to thermophily with almost all thermophilic species on the top. Some outliers of interest for the discussion are outlined.

#
# Figure 6
#
# Scatter plot of the first two columns of tw293$li: open circles where
# Topt < seuil, large filled circles where Topt >= seuil, plus crosses on the
# rows of a few genera and two legends.
#
ftpPath <- "/ftp/ftpdir/pub/datasets/JAG2003"
load(file.path(ftpPath, "tw293.RData"))
s <- read.table(file.path(ftpPath, "Topt.txt"), header = TRUE, sep = "\t")

# Rows are put in alphabetical order of their names before being indexed
# with columns of `s` (presumably `s` lists the species in that order).
tw293$li <- tw293$li[order(rownames(tw293$li)), ]

seuil <- 59
genre <- "Clostridium"

# Drawing is done inside local() so that the helper vectors below do not
# remain in the workspace.
local({
  f1 <- tw293$li[, 1]
  f2 <- tw293$li[, 2]
  below <- s$Topt < seuil
  above <- s$Topt >= seuil

  plot(f1[below], f2[below],
       asp = 1,
       pch = 1,
       xlim = range(f1),
       xlab = "First Factor (68.3%)",
       ylab = "Second Factor (11.0%)")
  points(f1[above], f2[above], pch = 19, cex = 1.5)

  text(0.0, -0.25, "Geobacillus stearothermophilus", pos = 4, cex = 0.75)
  text(-0.25, 0.60, "Eubacterium\nacidaminophilum", pos = 2, cex = 0.75)

  # "+" on the genus held in `genre` and on Eubacterium, "x" on Methanosarcina.
  for (g in c(genre, "Eubacterium")) {
    hit <- s$Genus == g
    points(f1[hit], f2[hit], pch = 3, col = "black", cex = 2.)
  }
  hit <- s$Genus == "Methanosarcina"
  points(f1[hit], f2[hit], pch = 4, col = "black", cex = 4.)

  legend(-0.9, -0.4,
         c(paste("Topt >= ", seuil), paste("Topt <  ", seuil)),
         pch = c(19, 1))

  legend(-0., -0.4,
         c("Clostridium", "Methanosarcina"),
         pch = c(3, 4), col = c("black", "black"))
})

#
# Bonus, not in paper: color version of Figure 6
#
# Same map as Figure 6 with colour instead of open/filled symbols: blue where
# Topt < seuil, red where Topt >= seuil. The genus markers listed in the
# second legend are drawn here as well; previously that legend described
# symbols that did not appear on the plot.
#
ftpPath <- "/ftp/ftpdir/pub/datasets/JAG2003"
load( paste(ftpPath, "tw293.RData", sep = "/" ))
s <- read.table(paste(ftpPath, "Topt.txt", sep = "/"), header = TRUE, sep = "\t")

tw293$li <- tw293$li[ order(rownames(tw293$li)), ]

seuil <- 59
genre <- "Clostridium"

plot(tw293$li[s$Topt<seuil,1],tw293$li[s$Topt<seuil,2],asp=1,
  xlab="First Factor (68.3%)",
  ylab="Second Factor (11.0%)",
  xlim=range(tw293$li[,1]),
  col = "blue", pch = 19,
  main = "Color version of figure 6 from\nLobry & Chessel (2003) JAG 44:235")

points(tw293$li[s$Topt>=seuil,1],tw293$li[s$Topt>=seuil,2],
  col = "red", pch = 19)
text(0.0,-0.25,"Geobacillus stearothermophilus",pos=4,cex=0.75, col = "red")
text(-0.25,0.60,"Eubacterium\nacidaminophilum",pos=2,cex=0.75, col = "blue")

# Genus markers, drawn with the same symbols and sizes as in Figure 6 so that
# the second legend matches what is on the plot.
points(tw293$li[s$Genus==genre,1],tw293$li[s$Genus==genre,2],
  pch=3,col="black",cex=2.)
points(tw293$li[s$Genus=="Eubacterium",1],tw293$li[s$Genus=="Eubacterium",2],
  pch=3,col="black",cex=2.)
points(tw293$li[s$Genus=="Methanosarcina",1],tw293$li[s$Genus=="Methanosarcina",2],
  pch=4,col="black",cex=4.)

legend(-0.9,-0.4,c(paste("Topt >= ",seuil),paste("Topt <  ",seuil)),
pch=c(19,19), col = c("red","blue"))

legend(-0.,-0.4,c("Clostridium","Methanosarcina"),
pch=c(3,4),col=c("black","black"))

Figure 7a. Same as figure 5a but for the partial genome dataset with 293 bacteria.

Figure 7b. Same as figure 5b but for the partial genome dataset with 293 bacteria.

If you have any problems or comments, please contact Jean Lobry.