Skip to contents

Takes a comparison object and filters it to include only the best hit for each query. The query names are provided by group_by, and the best hit is determined by sorting each set of queries based on a set of columns provided by sort_order.

Usage

best_hit(
  comparison,
  group_by = "name1",
  sort_order = c("bit_score", "aln_len", "per_id", "direction")
)

Arguments

comparison

A comparison object to filter.

group_by

A character string referring to a column in comparison that holds the query names.

sort_order

A character vector of column names that determine what the best hit is, in order of importance. The first column is used to determine the best hits per query, and each subsequent column is only used in case of ties.

Value

A comparison object.

Details

Designed to find the best hits per query from comparison objects generated from tabular BLAST or DIAMOND results. For each column from sort_order, the maximum (or minimum) value is taken for each unique query name from group_by, in the order provided by sort_order, until each query has only 1 hit left. The maximum value is taken for each sort_order column, unless the column provided is one of: "mism", "gaps", "e_value", "name1", "name2", "start1", "start2", "end1", or "end2", in which case the minimum value is taken instead.

Author

Mike Puijk

Examples

## Read example blastp results
infile <- system.file('extdata/blastp_example1.tab', package = 'genoPlotR')

## comparison before filtering for best hits
blast_comparison <- read_comparison_from_blast(infile)
print_comparison(blast_comparison)
#>     start1  end1 start2  end2          name1          name2 per_id aln_len
#>      <int> <int>  <int> <int>         <char>         <char>  <num>   <int>
#>  1:    117   502      1   385 HKIDPCCJ_00001 LABPAKCO_00001   58.7     387
#>  2:      1   376      1   379 HKIDPCCJ_00002 LABPAKCO_00002   56.5     379
#>  3:     19   651     12   645 HKIDPCCJ_00004 LABPAKCO_00005   74.6     634
#>  4:     23   655      8   644 HKIDPCCJ_00004 LABPAKCO_01957   51.4     642
#>  5:     11   811     10   812 HKIDPCCJ_00005 LABPAKCO_00006   62.8     803
#>  6:     11   810      6   809 HKIDPCCJ_00005 LABPAKCO_01958   40.0     817
#>  7:      1   172      1   190 HKIDPCCJ_00007 LABPAKCO_00009   55.8     197
#>  8:      1   172      1   164 HKIDPCCJ_00007 LABPAKCO_02342   50.0     174
#>  9:     93   535     67   509 HKIDPCCJ_00029 LABPAKCO_00168   28.7     453
#> 10:     87   521    107   546 HKIDPCCJ_00029 LABPAKCO_00977   26.3     452
#> 11:    328   529    335   539 HKIDPCCJ_00029 LABPAKCO_01459   33.2     205
#> 12:     88   534    100   548 HKIDPCCJ_00029 LABPAKCO_01460   23.7     451
#> 13:    304   498    312   502 HKIDPCCJ_00029 LABPAKCO_02399   31.3     201
#> 14:    328   521     28   227 HKIDPCCJ_00029 LABPAKCO_02526   29.8     208
#> 15:    328   519      4   214 HKIDPCCJ_00029 LABPAKCO_01272   29.1     213
#> 16:      1   637      1   644 HKIDPCCJ_00034 LABPAKCO_00103   56.1     645
#> 17:      2   210     11   270 HKIDPCCJ_00034 LABPAKCO_01390   31.5     260
#> 18:      8   303     12   290 HKIDPCCJ_00034 LABPAKCO_01699   30.1     306
#> 19:    100   531     78   511 HKIDPCCJ_00038 LABPAKCO_00168   24.1     456
#> 20:    250   513    267   544 HKIDPCCJ_00038 LABPAKCO_00977   26.1     284
#> 21:    267   513    290   533 HKIDPCCJ_00038 LABPAKCO_01460   27.6     254
#> 22:    325   503    331   513 HKIDPCCJ_00038 LABPAKCO_02399   28.1     185
#> 23:    109   473    105   460 HKIDPCCJ_00038 LABPAKCO_01612   22.5     374
#> 24:    359   501    368   516 HKIDPCCJ_00038 LABPAKCO_01459   29.6     152
#> 25:    320   514     22   226 HKIDPCCJ_00038 LABPAKCO_02526   25.8     209
#> 26:    323   485      2   187 HKIDPCCJ_00038 LABPAKCO_01272   22.1     204
#> 27:    451   514    331   398 HKIDPCCJ_00038 LABPAKCO_00394   32.4      68
#> 28:      8   338      4   334 HKIDPCCJ_00047 LABPAKCO_00854   54.1     333
#> 29:      4   336      1   333 HKIDPCCJ_00047 LABPAKCO_02361   27.1     336
#> 30:      1   442      1   443 HKIDPCCJ_00048 LABPAKCO_01328   33.8     444
#> 31:      1   438      1   440 HKIDPCCJ_00048 LABPAKCO_02752   32.4     442
#> 32:      1   379      1   378 HKIDPCCJ_00049 LABPAKCO_00839   72.5     382
#> 33:     15   348      9   310 HKIDPCCJ_00049 LABPAKCO_00753   32.2     342
#>     start1  end1 start2  end2          name1          name2 per_id aln_len
#>      mism  gaps   e_value bit_score direction
#>     <int> <int>     <num>     <num>     <num>
#>  1:   157     2 1.32e-153     440.0         1
#>  2:   162     1 1.18e-140     401.0         1
#>  3:   160     1  0.00e+00     946.0         1
#>  4:   298    11 9.19e-211     601.0         1
#>  5:   297     1  0.00e+00     989.0         1
#>  6:   460     7 4.55e-185     547.0         1
#>  7:    55     2  2.90e-56     172.0         1
#>  8:    75     2  8.61e-50     154.0         1
#>  9:   303    10  3.22e-49     174.0         1
#> 10:   304    13  2.30e-31     124.0         1
#> 11:   134     3  3.10e-30     120.0         1
#> 12:   338     6  4.45e-29     117.0         1
#> 13:   122     4  2.08e-22      97.1         1
#> 14:   124     6  2.37e-14      69.7         1
#> 15:   128     7  3.15e-14      71.2         1
#> 16:   274     5 7.15e-256     715.0         1
#> 17:   127     4  8.69e-31     124.0         1
#> 18:   177    10  5.54e-24     102.0         1
#> 19:   300     9  3.02e-28     114.0         1
#> 20:   184     8  3.67e-21      93.2         1
#> 21:   167     6  5.92e-19      86.3         1
#> 22:   125     2  9.99e-18      82.4         1
#> 23:   263     7  2.73e-17      80.9         1
#> 24:    95     3  1.32e-12      66.2         1
#> 25:   137     6  9.21e-11      58.9         1
#> 26:   100     7  6.56e-05      41.6         1
#> 27:    42     2  2.91e-04      39.7         1
#> 28:   149     3 6.03e-110     322.0         1
#> 29:   239     4  5.43e-29     111.0         1
#> 30:   291     3  2.02e-80     252.0         1
#> 31:   293     4  7.28e-74     235.0         1
#> 32:    98     3 5.42e-194     537.0         1
#> 33:   184    11  4.93e-43     149.0         1
#>      mism  gaps   e_value bit_score direction

## Filter for best hits and print results
bh_comparison <- best_hit(blast_comparison)
print_comparison(bh_comparison)
#>     start1  end1 start2  end2          name1          name2 per_id aln_len
#>      <int> <int>  <int> <int>         <char>         <char>  <num>   <int>
#>  1:    117   502      1   385 HKIDPCCJ_00001 LABPAKCO_00001   58.7     387
#>  2:      1   376      1   379 HKIDPCCJ_00002 LABPAKCO_00002   56.5     379
#>  3:     19   651     12   645 HKIDPCCJ_00004 LABPAKCO_00005   74.6     634
#>  4:     11   811     10   812 HKIDPCCJ_00005 LABPAKCO_00006   62.8     803
#>  5:      1   172      1   190 HKIDPCCJ_00007 LABPAKCO_00009   55.8     197
#>  6:     93   535     67   509 HKIDPCCJ_00029 LABPAKCO_00168   28.7     453
#>  7:      1   637      1   644 HKIDPCCJ_00034 LABPAKCO_00103   56.1     645
#>  8:    100   531     78   511 HKIDPCCJ_00038 LABPAKCO_00168   24.1     456
#>  9:      8   338      4   334 HKIDPCCJ_00047 LABPAKCO_00854   54.1     333
#> 10:      1   442      1   443 HKIDPCCJ_00048 LABPAKCO_01328   33.8     444
#> 11:      1   379      1   378 HKIDPCCJ_00049 LABPAKCO_00839   72.5     382
#>      mism  gaps   e_value bit_score direction
#>     <int> <int>     <num>     <num>     <num>
#>  1:   157     2 1.32e-153       440         1
#>  2:   162     1 1.18e-140       401         1
#>  3:   160     1  0.00e+00       946         1
#>  4:   297     1  0.00e+00       989         1
#>  5:    55     2  2.90e-56       172         1
#>  6:   303    10  3.22e-49       174         1
#>  7:   274     5 7.15e-256       715         1
#>  8:   300     9  3.02e-28       114         1
#>  9:   149     3 6.03e-110       322         1
#> 10:   291     3  2.02e-80       252         1
#> 11:    98     3 5.42e-194       537         1