library(alndist)

We use aligned mouse olfactory receptors as example. There are about 1000 of them. Aligned fasta file is provided as extdata in this package. Load these sequences.

or <- seqinr::read.fasta(system.file("extdata/mouseOR.fasta", package="alndist"), seqtype="AA")

Get two sequences.

x <- as.character(or[[1]])
y <- as.character(or[[2]])
x # case doesn't matter
#>   [1] "l" "e" "-" "-" "-" "-" "-" "f" "v" "l" "v" "g" "f" "-" "r" "l" "v"
#>  [18] "h" "l" "q" "g" "i" "l" "f" "s" "l" "f" "l" "t" "v" "y" "l" "l" "t"
#>  [35] "v" "a" "g" "n" "l" "l" "i" "v" "a" "l" "v" "s" "t" "d" "a" "a" "l"
#>  [52] "q" "s" "p" "m" "y" "f" "f" "l" "r" "i" "l" "s" "a" "l" "e" "i" "c"
#>  [69] "y" "t" "s" "v" "t" "v" "p" "l" "l" "l" "h" "h" "l" "l" "t" "g" "r"
#>  [86] "r" "h" "i" "s" "r" "s" "g" "c" "a" "l" "q" "m" "f" "f" "f" "l" "f"
#> [103] "f" "g" "a" "t" "e" "c" "c" "l" "l" "a" "a" "m" "a" "y" "d" "r" "y"
#> [120] "a" "a" "i" "c" "e" "p" "l" "r" "y" "q" "v" "l" "l" "s" "r" "r" "v"
#> [137] "c" "v" "q" "l" "a" "g" "a" "a" "w" "s" "c" "g" "a" "l" "v" "g" "l"
#> [154] "g" "h" "t" "s" "f" "i" "f" "s" "l" "p" "f" "c" "g" "p" "n" "a" "v"
#> [171] "p" "h" "f" "f" "c" "e" "i" "q" "p" "v" "l" "q" "l" "v" "c" "g" "d"
#> [188] "t" "s" "l" "n" "e" "l" "q" "i" "l" "a" "a" "a" "l" "i" "i" "l" "c"
#> [205] "-" "p" "f" "g" "l" "i" "l" "s" "s" "y" "g" "r" "i" "l" "v" "t" "i"
#> [222] "f" "r" "i" "p" "s" "a" "a" "g" "r" "r" "k" "a" "f" "s" "t" "c" "s"
#> [239] "s" "h" "l" "v" "v" "v" "s" "l" "f" "y" "g" "t" "a" "i" "f" "i" "y"
#> [256] "i" "r" "p" "k" "a" "s" "p" "t" "t" "d" "p" "l" "l" "s" "l" "f" "y"
#> [273] "a" "v" "i" "t" "p" "i" "l" "n" "p" "v" "i" "y" "s" "l" "r" "n" "a"
#> [290] "d" "v" "k" "a" "a" "l" "k" "r" "s" "-" "i" "q" "k"

Get the substitution matrix.

mtx <- get_substitute_mtx("PAM250")

Calculate weighted distance between the two sequences.

weights <- rep(1, length(x)) # ideally should reflect your belief on the importance of each residue, all 1 here
two_seq_score(x, y, mtx, weights)
#> [1] 685

The hope is to be able to calculate pairwise distances of larger number of sequences. Pairwise distances for all mouse olfactory receptors:

lst <- purrr::map(or, ~as.character(.x))
names(lst) <- purrr::map(or, ~attr(.x, "Annot"))
dist_mtx <- pairwise_score(lst, mtx, weights)

This will actually lead to some clustering of receptors (e.g. compare with Figure 2 here http://stke.sciencemag.org/content/2/60/ra9)

dist_pca <- prcomp(dist_mtx, scale. = TRUE)
ggplot2::ggplot(as.data.frame(dist_pca$x), ggplot2::aes(PC1, PC2)) +
  ggplot2::geom_point(alpha=0.3)

We can also try ~10k sequences, the time should still be reasonable. There’s also a parallel_pairwise_score to be used when there are multiple cores available. On my 2015 Macbook Pro they took ~ 1 minute and ~ 30 seconds respectively.

long_lst <- rep(lst, 10)
names(long_lst) <- 1:length(long_lst)
long_weights <- rep(1, length(long_lst))
system.time(dist_mtx <- pairwise_score(long_lst, mtx, weights))
#>    user  system elapsed 
#>  76.132   0.429  76.818
system.time(dist_mtx_p <- parallel_pairwise_score(long_lst, mtx, weights))
#>    user  system elapsed 
#> 213.714   2.593  35.685
identical(dist_mtx, dist_mtx_p)
#> [1] TRUE

You can further benchmark if interested. I’m not running this part.

microbenchmark::microbenchmark(dist_mtx <- pairwise_score(lst, mtx, weights))
microbenchmark::microbenchmark(dist_mtx_p <- parallel_pairwise_score(lst, mtx, weights))