Introduction

Shoji Taniguchi

0. Introduction to gpyramid package

R package gpyramid has been designed for gene pyrammiding in plant breeding.

The gene pyramidding was formulated by Servin et al. (2004) <DOI: 10.1534/genetics.103.023358>.

This document describes how to conduct the same calculation as Servin et al. (2004) in the R environment.

1. Set up

library(gpyramid)
library(ape)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:ape':
#> 
#>     where
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

2. Prepare data

2.1 Gene data

line_df <- data.frame(line = c("x1", "x2", "x3", "x4"),
                      gene1 = c("A", "B", "B", "B"),
                      gene2 = c("B", "A", "B", "B"),
                      gene3 = c("B", "B", "A", "B"),
                      gene4 = c("B", "B", "B", "A"))

line_df
#>   line gene1 gene2 gene3 gene4
#> 1   x1     A     B     B     B
#> 2   x2     B     A     B     B
#> 3   x3     B     B     A     B
#> 4   x4     B     B     B     A

2.2 Position data

position_df <- data.frame(Gene = c("g1", "g2", "g3", "g4"),
                          Chr = c("1A", "1A", "1A", "1A"),
                          cM = c(0, 20, 40, 60))
position_df
#>   Gene Chr cM
#> 1   g1  1A  0
#> 2   g2  1A 20
#> 3   g3  1A 40
#> 4   g4  1A 60

2.3 Preprosessing

Generate haplotype dataframe from row data

gene_dat <- util_haplo(line_df, target = "A", non_target = "B", hetero = "H", line_cul = "line")

gene_df1 <- gene_dat[[1]]
gene_df2 <- gene_dat[[2]]
line_id <- gene_dat[[3]]

colnames(gene_df1) <- line_id
colnames(gene_df2) <- line_id

gene_df1
#>      x1 x2 x3 x4
#> [1,]  1  0  0  0
#> [2,]  0  1  0  0
#> [3,]  0  0  1  0
#> [4,]  0  0  0  1
gene_df2
#>      x1 x2 x3 x4
#> [1,]  1  0  0  0
#> [2,]  0  1  0  0
#> [3,]  0  0  1  0
#> [4,]  0  0  0  1

Generate recombination probability matrix from raw data

recom_mat <- util_recom_mat(position_df, "cM")
recom_mat
#>           [,1]      [,2]      [,3]      [,4]
#> [1,] 0.0000000 0.1648400 0.2753355 0.3494029
#> [2,] 0.1648400 0.0000000 0.1648400 0.2753355
#> [3,] 0.2753355 0.1648400 0.0000000 0.1648400
#> [4,] 0.3494029 0.2753355 0.1648400 0.0000000

3. Find parent sets from candidate lines (cultivars)

Fron candidate lines, findPset function returns the parent sets for gene pyramidding.

In this example, only one parent set was returned.

line_comb_lis <- findPset(gene_df1, gene_df2, line_id)
line_comb_lis
#> [[1]]
#> [1] "x1" "x2" "x3" "x4"

4. Calculate the number of necessary individuals and generations

calcCostAll function calculates the number of necessary individuals and generations as the crossing cost for all the crossing schemes.

Given parent sets for gene pyramidding, calCostAll function simulates all the crossing schemes and calculates the number of necessary individuals and generations as the cost of gene pyramidding.

calcCostAll function returns the gpyramid_all object, which contains information of all the crossing schemes.

Here, getFromAll function get one crossing scheme from gpyramid_all object.

rslt <- calcCostAll(line_comb_lis, gene_df1, gene_df2, recom_mat, 
                    prob_total = 0.99, last_cross = T, last_selfing = T)
rslt$cost_all
#> # A tibble: 15 × 6
#>    cross_id n_generation N_total n_parent line_comb_id TR_id
#>       <dbl>        <dbl>   <dbl>    <dbl>        <dbl> <dbl>
#>  1        1            3     887        4            1     1
#>  2        2            2    2908        4            1     2
#>  3        3            3     714        4            1     3
#>  4        4            3     733        4            1     4
#>  5        5            3     650        4            1     5
#>  6        6            2     961        4            1     6
#>  7        7            3     650        4            1     7
#>  8        8            3     325        4            1     8
#>  9        9            3     325        4            1     9
#> 10       10            3     325        4            1    10
#> 11       11            3     887        4            1    11
#> 12       12            3     733        4            1    12
#> 13       13            2    1001        4            1    13
#> 14       14            3     714        4            1    14
#> 15       15            3     325        4            1    15

4.1 Fig 4a (Servin et al., 2004)

Fig 4a in Servin et al. (2004) corresponds to cross_id = 15 in the above gpyramid_all object.

rslt_one <- getFromAll(rslt, cross_id = 15)
summary(rslt_one)
#> Number of necessary plants for each crossing
#> 
#>         nodeid n_plant
#> 1       node_6       1
#> 2       node_7      70
#> 3       node_5      84
#> 4   last cross     102
#> 5 last selfing      68
#> 
#> Target haplotype of crossing
#> 
#> nodeid=5
#>   hap1 hap2
#> 1    0    1
#> 2    0    1
#> 3    0    1
#> 4    1    0
#> 
#> nodeid=6
#>   hap1 hap2
#> 1    1    0
#> 2    0    1
#> 3    0    0
#> 4    0    0
#> 
#> nodeid=7
#>   hap1 hap2
#> 1    0    1
#> 2    0    1
#> 3    1    0
#> 4    0    0
plot(rslt_one$topolo)
nodelabels()

4.2 Fig 4b (Servin et al., 2004)

Fig 4b in Servin et al. (2004) corresponds to cross_id = 6 in the above gpyramid_all object.

rslt_one <- getFromAll(rslt, cross_id = 6)
summary(rslt_one)
#> Number of necessary plants for each crossing
#> 
#>         nodeid n_plant
#> 1       node_7       1
#> 2       node_6       1
#> 3       node_5     394
#> 4   last cross     500
#> 5 last selfing      65
#> 
#> Target haplotype of crossing
#> 
#> nodeid=5
#>   hap1 hap2
#> 1    0    1
#> 2    1    0
#> 3    1    0
#> 4    0    1
#> 
#> nodeid=6
#>   hap1 hap2
#> 1    0    0
#> 2    1    0
#> 3    0    1
#> 4    0    0
#> 
#> nodeid=7
#>   hap1 hap2
#> 1    1    0
#> 2    0    0
#> 3    0    0
#> 4    0    1
plot(rslt_one$topolo)
nodelabels()

4.3 Fig 4c (Servin et al., 2004)

Fig 4c in Servin et al. (2004) corresponds to cross_id = 13 in the above gpyramid_all object.

rslt_one <- getFromAll(rslt, cross_id = 13)
summary(rslt_one)
#> Number of necessary plants for each crossing
#> 
#>         nodeid n_plant
#> 1       node_7       1
#> 2       node_6       1
#> 3       node_5     837
#> 4   last cross      97
#> 5 last selfing      65
#> 
#> Target haplotype of crossing
#> 
#> nodeid=5
#>   hap1 hap2
#> 1    1    0
#> 2    1    0
#> 3    0    1
#> 4    0    1
#> 
#> nodeid=6
#>   hap1 hap2
#> 1    1    0
#> 2    0    1
#> 3    0    0
#> 4    0    0
#> 
#> nodeid=7
#>   hap1 hap2
#> 1    0    0
#> 2    0    0
#> 3    1    0
#> 4    0    1
plot(rslt_one$topolo)
nodelabels()