Filter a comparison to include only bidirectional best hits
bidirectional_best_hit.Rd
Takes a `comparison` object and filters it to include only bidirectional best hits, with the use of a second `comparison`, provided by `other_direction`. Both `comparison` objects must already be filtered for best hits (see `best_hit`).
Usage
bidirectional_best_hit(
comparison,
other_direction,
group_by1 = "name1",
group_by2 = "name2"
)
Arguments
- comparison
  A `comparison` object to filter.
- other_direction
  A `comparison` object that the `comparison` to filter is compared with.
- group_by1
  A character string referring to a column in `comparison` that holds its query names.
- group_by2
  A character string referring to a column in `other_direction` that holds its query names.
Details
The best hits from the first `comparison` are only kept when their query-subject combinations can also be found in the best hits from the second `comparison`. For example, take a best hit in the first `comparison` with the query name "geneA" and subject name "geneB". In the `comparison` provided by `other_direction`, the best hit for the query name "geneB" has to be the subject name "geneA". Only then is the hit kept as a bidirectional best hit. The query names are provided by `group_by1` and `group_by2`, for the `comparison` to filter and the other `comparison` respectively.
Examples
## Read example blastp results
infile1 <- system.file('extdata/blastp_example1.tab', package = 'genoPlotR')
## comparison before filtering for best hits
blast_comparison1 <- read_comparison_from_blast(infile1)
print_comparison(blast_comparison1)
#> start1 end1 start2 end2 name1 name2 per_id aln_len
#> <int> <int> <int> <int> <char> <char> <num> <int>
#> 1: 117 502 1 385 HKIDPCCJ_00001 LABPAKCO_00001 58.7 387
#> 2: 1 376 1 379 HKIDPCCJ_00002 LABPAKCO_00002 56.5 379
#> 3: 19 651 12 645 HKIDPCCJ_00004 LABPAKCO_00005 74.6 634
#> 4: 23 655 8 644 HKIDPCCJ_00004 LABPAKCO_01957 51.4 642
#> 5: 11 811 10 812 HKIDPCCJ_00005 LABPAKCO_00006 62.8 803
#> 6: 11 810 6 809 HKIDPCCJ_00005 LABPAKCO_01958 40.0 817
#> 7: 1 172 1 190 HKIDPCCJ_00007 LABPAKCO_00009 55.8 197
#> 8: 1 172 1 164 HKIDPCCJ_00007 LABPAKCO_02342 50.0 174
#> 9: 93 535 67 509 HKIDPCCJ_00029 LABPAKCO_00168 28.7 453
#> 10: 87 521 107 546 HKIDPCCJ_00029 LABPAKCO_00977 26.3 452
#> 11: 328 529 335 539 HKIDPCCJ_00029 LABPAKCO_01459 33.2 205
#> 12: 88 534 100 548 HKIDPCCJ_00029 LABPAKCO_01460 23.7 451
#> 13: 304 498 312 502 HKIDPCCJ_00029 LABPAKCO_02399 31.3 201
#> 14: 328 521 28 227 HKIDPCCJ_00029 LABPAKCO_02526 29.8 208
#> 15: 328 519 4 214 HKIDPCCJ_00029 LABPAKCO_01272 29.1 213
#> 16: 1 637 1 644 HKIDPCCJ_00034 LABPAKCO_00103 56.1 645
#> 17: 2 210 11 270 HKIDPCCJ_00034 LABPAKCO_01390 31.5 260
#> 18: 8 303 12 290 HKIDPCCJ_00034 LABPAKCO_01699 30.1 306
#> 19: 100 531 78 511 HKIDPCCJ_00038 LABPAKCO_00168 24.1 456
#> 20: 250 513 267 544 HKIDPCCJ_00038 LABPAKCO_00977 26.1 284
#> 21: 267 513 290 533 HKIDPCCJ_00038 LABPAKCO_01460 27.6 254
#> 22: 325 503 331 513 HKIDPCCJ_00038 LABPAKCO_02399 28.1 185
#> 23: 109 473 105 460 HKIDPCCJ_00038 LABPAKCO_01612 22.5 374
#> 24: 359 501 368 516 HKIDPCCJ_00038 LABPAKCO_01459 29.6 152
#> 25: 320 514 22 226 HKIDPCCJ_00038 LABPAKCO_02526 25.8 209
#> 26: 323 485 2 187 HKIDPCCJ_00038 LABPAKCO_01272 22.1 204
#> 27: 451 514 331 398 HKIDPCCJ_00038 LABPAKCO_00394 32.4 68
#> 28: 8 338 4 334 HKIDPCCJ_00047 LABPAKCO_00854 54.1 333
#> 29: 4 336 1 333 HKIDPCCJ_00047 LABPAKCO_02361 27.1 336
#> 30: 1 442 1 443 HKIDPCCJ_00048 LABPAKCO_01328 33.8 444
#> 31: 1 438 1 440 HKIDPCCJ_00048 LABPAKCO_02752 32.4 442
#> 32: 1 379 1 378 HKIDPCCJ_00049 LABPAKCO_00839 72.5 382
#> 33: 15 348 9 310 HKIDPCCJ_00049 LABPAKCO_00753 32.2 342
#> start1 end1 start2 end2 name1 name2 per_id aln_len
#> mism gaps e_value bit_score direction
#> <int> <int> <num> <num> <num>
#> 1: 157 2 1.32e-153 440.0 1
#> 2: 162 1 1.18e-140 401.0 1
#> 3: 160 1 0.00e+00 946.0 1
#> 4: 298 11 9.19e-211 601.0 1
#> 5: 297 1 0.00e+00 989.0 1
#> 6: 460 7 4.55e-185 547.0 1
#> 7: 55 2 2.90e-56 172.0 1
#> 8: 75 2 8.61e-50 154.0 1
#> 9: 303 10 3.22e-49 174.0 1
#> 10: 304 13 2.30e-31 124.0 1
#> 11: 134 3 3.10e-30 120.0 1
#> 12: 338 6 4.45e-29 117.0 1
#> 13: 122 4 2.08e-22 97.1 1
#> 14: 124 6 2.37e-14 69.7 1
#> 15: 128 7 3.15e-14 71.2 1
#> 16: 274 5 7.15e-256 715.0 1
#> 17: 127 4 8.69e-31 124.0 1
#> 18: 177 10 5.54e-24 102.0 1
#> 19: 300 9 3.02e-28 114.0 1
#> 20: 184 8 3.67e-21 93.2 1
#> 21: 167 6 5.92e-19 86.3 1
#> 22: 125 2 9.99e-18 82.4 1
#> 23: 263 7 2.73e-17 80.9 1
#> 24: 95 3 1.32e-12 66.2 1
#> 25: 137 6 9.21e-11 58.9 1
#> 26: 100 7 6.56e-05 41.6 1
#> 27: 42 2 2.91e-04 39.7 1
#> 28: 149 3 6.03e-110 322.0 1
#> 29: 239 4 5.43e-29 111.0 1
#> 30: 291 3 2.02e-80 252.0 1
#> 31: 293 4 7.28e-74 235.0 1
#> 32: 98 3 5.42e-194 537.0 1
#> 33: 184 11 4.93e-43 149.0 1
#> mism gaps e_value bit_score direction
## Filter for best hits and print results
bh_comparison1 <- best_hit(blast_comparison1)
print_comparison(bh_comparison1)
#> start1 end1 start2 end2 name1 name2 per_id aln_len
#> <int> <int> <int> <int> <char> <char> <num> <int>
#> 1: 117 502 1 385 HKIDPCCJ_00001 LABPAKCO_00001 58.7 387
#> 2: 1 376 1 379 HKIDPCCJ_00002 LABPAKCO_00002 56.5 379
#> 3: 19 651 12 645 HKIDPCCJ_00004 LABPAKCO_00005 74.6 634
#> 4: 11 811 10 812 HKIDPCCJ_00005 LABPAKCO_00006 62.8 803
#> 5: 1 172 1 190 HKIDPCCJ_00007 LABPAKCO_00009 55.8 197
#> 6: 93 535 67 509 HKIDPCCJ_00029 LABPAKCO_00168 28.7 453
#> 7: 1 637 1 644 HKIDPCCJ_00034 LABPAKCO_00103 56.1 645
#> 8: 100 531 78 511 HKIDPCCJ_00038 LABPAKCO_00168 24.1 456
#> 9: 8 338 4 334 HKIDPCCJ_00047 LABPAKCO_00854 54.1 333
#> 10: 1 442 1 443 HKIDPCCJ_00048 LABPAKCO_01328 33.8 444
#> 11: 1 379 1 378 HKIDPCCJ_00049 LABPAKCO_00839 72.5 382
#> mism gaps e_value bit_score direction
#> <int> <int> <num> <num> <num>
#> 1: 157 2 1.32e-153 440 1
#> 2: 162 1 1.18e-140 401 1
#> 3: 160 1 0.00e+00 946 1
#> 4: 297 1 0.00e+00 989 1
#> 5: 55 2 2.90e-56 172 1
#> 6: 303 10 3.22e-49 174 1
#> 7: 274 5 7.15e-256 715 1
#> 8: 300 9 3.02e-28 114 1
#> 9: 149 3 6.03e-110 322 1
#> 10: 291 3 2.02e-80 252 1
#> 11: 98 3 5.42e-194 537 1
## Repeat the steps for BLAST results in the other direction
infile2 <- system.file('extdata/blastp_example2.tab', package = 'genoPlotR')
blast_comparison2 <- read_comparison_from_blast(infile2)
bh_comparison2 <- best_hit(blast_comparison2)
## Filter for bidirectional best hits and print results
bbh_comparison1 <- bidirectional_best_hit(comparison = bh_comparison1,
other_direction = bh_comparison2)
print_comparison(bbh_comparison1)
#> start1 end1 start2 end2 name1 name2 per_id aln_len mism
#> <int> <int> <int> <int> <char> <char> <num> <int> <int>
#> 1: 117 502 1 385 HKIDPCCJ_00001 LABPAKCO_00001 58.7 387 157
#> 2: 1 376 1 379 HKIDPCCJ_00002 LABPAKCO_00002 56.5 379 162
#> 3: 19 651 12 645 HKIDPCCJ_00004 LABPAKCO_00005 74.6 634 160
#> 4: 11 811 10 812 HKIDPCCJ_00005 LABPAKCO_00006 62.8 803 297
#> 5: 1 172 1 190 HKIDPCCJ_00007 LABPAKCO_00009 55.8 197 55
#> 6: 1 637 1 644 HKIDPCCJ_00034 LABPAKCO_00103 56.1 645 274
#> 7: 8 338 4 334 HKIDPCCJ_00047 LABPAKCO_00854 54.1 333 149
#> 8: 1 442 1 443 HKIDPCCJ_00048 LABPAKCO_01328 33.8 444 291
#> 9: 1 379 1 378 HKIDPCCJ_00049 LABPAKCO_00839 72.5 382 98
#> gaps e_value bit_score direction
#> <int> <num> <num> <num>
#> 1: 2 1.32e-153 440 1
#> 2: 1 1.18e-140 401 1
#> 3: 1 0.00e+00 946 1
#> 4: 1 0.00e+00 989 1
#> 5: 2 2.90e-56 172 1
#> 6: 5 7.15e-256 715 1
#> 7: 3 6.03e-110 322 1
#> 8: 3 2.02e-80 252 1
#> 9: 3 5.42e-194 537 1