Class: Match::Otu::TaxonName

Inherits:
Object
  • Object
show all
Defined in:
lib/match/otu/taxon_name.rb

Constant Summary collapse

MAX_NAMES =
1000
MATCHABLE_COLUMNS =
[:cached, :cached_secondary_homonym, :cached_primary_homonym].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(names:, project_id:, levenshtein_distance: 0, taxon_name_id: nil, resolve_synonyms: false, try_without_subgenus: false) ⇒ TaxonName

Returns a new instance of TaxonName.

Parameters:

  • names (Array<String>)

    array of name strings to match

  • project_id (Integer)
  • levenshtein_distance (Integer) (defaults to: 0)

    0 for exact, 1-8 for fuzzy

  • taxon_name_id (Integer, nil) (defaults to: nil)

    scope matches to descendants of this TaxonName

  • resolve_synonyms (Boolean) (defaults to: false)

    when true, resolve synonyms to valid names and return their OTUs

  • try_without_subgenus (Boolean) (defaults to: false)

    when true and cached match fails, try cached_secondary_homonym then cached_primary_homonym



39
40
41
42
43
44
45
46
# File 'lib/match/otu/taxon_name.rb', line 39

# Build a matcher for a batch of scientific name strings.
#
# @param names [Array<String>, String, nil] name strings to match; only the
#   first MAX_NAMES entries are kept. nil is treated as an empty list and a
#   single String as a one-element list.
# @param project_id [Integer] project whose TaxonNames and OTUs are searched
# @param levenshtein_distance [#to_i] 0 for exact matching, greater than 0 for fuzzy matching
# @param taxon_name_id [Integer, nil] when present, lookups are constrained through
#   taxon_name_hierarchies to this ancestor
# @param resolve_synonyms [Boolean] when true, OTUs are looked up on the valid name of a matched synonym
# @param try_without_subgenus [Boolean] when true and the cached column has no match,
#   cached_secondary_homonym and then cached_primary_homonym are tried
def initialize(names:, project_id:, levenshtein_distance: 0, taxon_name_id: nil, resolve_synonyms: false, try_without_subgenus: false)
  # Array() normalizes nil and a bare String so #call can always rely on
  # Array semantics (uniq/map) instead of failing on unexpected input shapes.
  @names = Array(names).first(MAX_NAMES)
  @project_id = project_id
  @levenshtein_distance = levenshtein_distance.to_i
  @taxon_name_id = taxon_name_id
  @resolve_synonyms = resolve_synonyms
  @try_without_subgenus = try_without_subgenus
end

Instance Attribute Details

#levenshtein_distance ⇒ Object (readonly)

Returns the value of attribute levenshtein_distance.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the edit-distance setting stored by the constructor
# (the constructor coerces it with to_i).
#
# @return [Integer]
def levenshtein_distance = @levenshtein_distance

#names ⇒ Object (readonly)

Returns the value of attribute names.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the name strings kept by the constructor
# (at most MAX_NAMES entries of the list it was given).
#
# @return [Array<String>]
def names = @names

#project_id ⇒ Object (readonly)

Returns the value of attribute project_id.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the project id stored by the constructor; the lookups in this
# class filter TaxonName and Otu records by it.
#
# @return [Integer]
def project_id = @project_id

#resolve_synonyms ⇒ Object (readonly)

Returns the value of attribute resolve_synonyms.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the resolve_synonyms flag stored by the constructor.
#
# @return [Boolean]
def resolve_synonyms = @resolve_synonyms

#taxon_name_id ⇒ Object (readonly)

Returns the value of attribute taxon_name_id.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the optional ancestor TaxonName id stored by the constructor.
#
# @return [Integer, nil]
def taxon_name_id = @taxon_name_id

#try_without_subgenus ⇒ Object (readonly)

Returns the value of attribute try_without_subgenus.



31
32
33
# File 'lib/match/otu/taxon_name.rb', line 31

# Reader for the try_without_subgenus flag stored by the constructor.
#
# @return [Boolean]
def try_without_subgenus = @try_without_subgenus

Instance Method Details

#base_scope ⇒ ActiveRecord::Relation (private)

Build the base TaxonName scope, optionally constrained to descendants of taxon_name_id.

Returns:

  • (ActiveRecord::Relation)


148
149
150
151
152
153
154
155
156
157
158
# File 'lib/match/otu/taxon_name.rb', line 148

# Starting relation for every lookup: TaxonNames of the current project.
# When a taxon_name_id was given, the relation is additionally joined to
# taxon_name_hierarchies and limited to rows whose ancestor is that id.
#
# @return [ActiveRecord::Relation]
def base_scope
  project_names = ::TaxonName.where(project_id: project_id)
  return project_names unless taxon_name_id.present?

  hierarchy_join = 'JOIN taxon_name_hierarchies ON taxon_names.id = taxon_name_hierarchies.descendant_id'

  project_names
    .joins(hierarchy_join)
    .where(taxon_name_hierarchies: { ancestor_id: taxon_name_id })
end

#call ⇒ Array<Hash>

Returns:

  • (Array<Hash>)


49
50
51
52
53
54
55
56
57
58
# File 'lib/match/otu/taxon_name.rb', line 49

# Match every requested name and report the results in input order.
#
# Each distinct name is looked up once via match_name; repeated names reuse
# that result.
#
# @return [Array<Hash>] one hash per entry of #names: the match_name result
#   merged with scientific_name set to the input string
def call
  results_by_name = names.uniq.to_h { |candidate| [candidate, match_name(candidate)] }

  names.map do |requested|
    results_by_name[requested].merge(scientific_name: requested)
  end
end

#find_taxon_names(name, column: :cached) ⇒ Array<TaxonName> (private)

Parameters:

  • name (String)
  • column (Symbol) (defaults to: :cached)

    :cached, :cached_secondary_homonym, or :cached_primary_homonym

Returns:

  • (Array<TaxonName>)



100
101
102
103
104
105
106
# File 'lib/match/otu/taxon_name.rb', line 100

# Dispatch a lookup to the exact or the fuzzy finder depending on the
# configured levenshtein_distance (fuzzy only when it is greater than zero).
#
# @param name [String]
# @param column [Symbol] :cached, :cached_secondary_homonym, or :cached_primary_homonym
# @return [Array<TaxonName>]
def find_taxon_names(name, column: :cached)
  return find_taxon_names_exact(name, column:) unless levenshtein_distance.positive?

  find_taxon_names_fuzzy(name, column:)
end

#find_taxon_names_exact(name, column: :cached) ⇒ Array<TaxonName> (private)

Parameters:

  • name (String)
  • column (Symbol) (defaults to: :cached)

Returns:

  • (Array<TaxonName>)



111
112
113
114
# File 'lib/match/otu/taxon_name.rb', line 111

# Load the TaxonNames from base_scope whose +column+ equals +name+ exactly.
#
# @param name [String]
# @param column [Symbol] column to compare against
# @return [Array<TaxonName>]
def find_taxon_names_exact(name, column: :cached)
  base_scope.where(column => name).to_a
end

#find_taxon_names_fuzzy(name, column: :cached) ⇒ Array<TaxonName> (private)

Parameters:

  • name (String)
  • column (Symbol) (defaults to: :cached)

Returns:

  • (Array<TaxonName>)

Raises:

  • (ArgumentError)


121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/match/otu/taxon_name.rb', line 121

# Load up to 10 TaxonNames from base_scope whose +column+ is within the
# configured edit distance of +name+, closest first.
#
# The distance is capped at 8, and both the column value and +name+ are cut
# to 255 characters before being handed to the SQL levenshtein() function.
# NOTE(review): presumably the cuts mirror a 255-character input limit of the
# database's levenshtein() implementation — confirm.
#
# @param name [String]
# @param column [Symbol] must be one of MATCHABLE_COLUMNS
# @return [Array<TaxonName>]
# @raise [ArgumentError] when +column+ is not in MATCHABLE_COLUMNS
def find_taxon_names_fuzzy(name, column: :cached)
  # column is interpolated into raw SQL below, so only allow-listed names may pass.
  raise ArgumentError, "Invalid column: #{column}" unless MATCHABLE_COLUMNS.include?(column)

  needle = name[0, 255]
  max_distance = levenshtein_distance.clamp(..8)
  distance_sql = "levenshtein(left(taxon_names.#{column}, 255), ?)"

  base_scope
    .where("#{distance_sql} <= ?", needle, max_distance)
    .order(Arel.sql(::TaxonName.sanitize_sql_array([distance_sql, needle])))
    .limit(10)
    .to_a
end

#match_name(name) ⇒ Hash (private)

Parameters:

  • name (String)

Returns:

  • (Hash)


64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/match/otu/taxon_name.rb', line 64

# Match a single name string and describe the outcome.
#
# The cached column is searched first. With try_without_subgenus enabled and
# no hit, cached_secondary_homonym and then cached_primary_homonym are tried.
# Candidates are ordered by rank_taxon_names and the first one is reported.
# With resolve_synonyms enabled, OTUs are read from the candidate's valid name
# (cached_valid_taxon_name_id) when that record exists in the project;
# otherwise from the candidate itself.
#
# @param name [String]
# @return [Hash] keys :taxon_name_id, :taxon_name, :otus, :ambiguous, :matched
def match_name(name)
  candidates = find_taxon_names(name)

  if candidates.empty? && try_without_subgenus
    %i[cached_secondary_homonym cached_primary_homonym].each do |fallback_column|
      # Stop at the first fallback column that produced any candidate.
      break unless candidates.empty?

      candidates = find_taxon_names(name, column: fallback_column)
    end
  end

  if candidates.empty?
    return { taxon_name_id: nil, taxon_name: nil, otus: [], ambiguous: false, matched: false }
  end

  ranked = rank_taxon_names(candidates)
  best = ranked.first

  otu_owner = best
  if resolve_synonyms && best.cached_valid_taxon_name_id != best.id
    # Keep the matched name itself when its valid name cannot be found in this project.
    otu_owner = ::TaxonName.where(project_id: project_id).find_by(id: best.cached_valid_taxon_name_id) || best
  end

  otus = ::Otu.where(project_id: project_id, taxon_name_id: otu_owner.id).to_a

  {
    taxon_name_id: best.id,
    taxon_name: best,
    otus:,
    ambiguous: ranked.length > 1,
    matched: true
  }
end

#rank_taxon_names(taxon_names) ⇒ Array<TaxonName> (private)

Rank candidate TaxonNames:

1. Prefer those with OTUs
2. Prefer valid names

Parameters:

  • taxon_names (Array<TaxonName>)

Returns:

  • (Array<TaxonName>)



165
166
167
168
169
170
171
172
173
174
175
# File 'lib/match/otu/taxon_name.rb', line 165

# Rank candidate TaxonNames, best first:
#
# 1. Prefer those that have at least one OTU in the project
# 2. Prefer valid names (cached_valid_taxon_name_id equals the name's own id)
# 3. Otherwise keep the order in which the candidates were passed in
#
# The third criterion matters because Enumerable#sort_by is not guaranteed to
# be stable: without it, candidates that tie on 1 and 2 could be reordered,
# discarding the closest-first ordering produced by find_taxon_names_fuzzy.
#
# @param taxon_names [Array<TaxonName>]
# @return [Array<TaxonName>] the same records, reordered
def rank_taxon_names(taxon_names)
  taxon_name_ids = taxon_names.map(&:id)
  ids_with_otus = ::Otu
    .where(project_id: project_id, taxon_name_id: taxon_name_ids)
    .distinct
    .pluck(:taxon_name_id)
    .to_set

  ranked_pairs = taxon_names.each_with_index.sort_by do |tn, position|
    [
      ids_with_otus.include?(tn.id) ? 0 : 1,
      tn.cached_valid_taxon_name_id == tn.id ? 0 : 1,
      position # tie-breaker that makes the sort stable
    ]
  end

  ranked_pairs.map(&:first)
end