Class: ImportDataset::DarwinCore::Checklist

Inherits:
ImportDataset::DarwinCore show all
Defined in:
app/models/import_dataset/darwin_core/checklist.rb

Constant Summary collapse

MINIMUM_FIELD_SET =
["taxonID", "scientificName", "parentNameUsageID"]

Constants inherited from ImportDataset::DarwinCore

CHECKLIST_ROW_TYPE, OCCURRENCES_ROW_TYPE

Instance Attribute Summary

Attributes inherited from ImportDataset

#description, #metadata, #source_content_type, #source_file_name, #source_file_size, #source_updated_at, #status

Instance Method Summary collapse

Methods inherited from ImportDataset::DarwinCore

#add_filters, #check_field_set, #core_records_are_readable, #core_records_fields, create_with_subtype_detection, default_if_absent, #default_nomenclatural_code, #destroy_namespace, #get_col_sep, #get_core_record_identifier_namespace, #get_dwc_default_values, #get_dwc_headers, #get_dwc_records, #get_field_mapping, #get_fields_mapping, #get_normalized_dwc_term, #get_quote_char, #get_records, #import, #initialize, #progress, #set_import_settings, #stage, #start_import, #stop_import

Methods inherited from ImportDataset

#delete_origin_relationships, #stage

Methods included from Shared::OriginRelationship

#new_objects, #old_objects, #reject_origin_relationships, #set_origin

Methods included from Shared::IsData

#errors_excepting, #full_error_messages_excepting, #identical, #is_community?, #is_destroyable?, #is_editable?, #is_in_use?, #is_in_users_projects?, #metamorphosize, #similar

Methods included from Housekeeping

#has_polymorphic_relationship?

Methods inherited from ApplicationRecord

transaction_with_retry

Constructor Details

This class inherits a constructor from ImportDataset::DarwinCore

Instance Method Details

#add_error_message(record, column_name, message) ⇒ Object (private)

Parameters:

  • column_name (String, Symbol)
  • record (Hash)

    The record hash to add the error message to

  • message (String)


309
310
311
312
313
314
315
316
317
# File 'app/models/import_dataset/darwin_core/checklist.rb', line 309

# Appends an error message for a column to a staged record.
#
# Messages are collected under record[:error_data][:messages], keyed by the
# column name as a Symbol, with one Array of messages per column.
#
# @param record [Hash] the record hash to add the error message to
# @param column_name [String, Symbol] column the message refers to
# @param message [String] the message to append
# @return [Array<String>] all messages recorded so far for that column
def add_error_message(record, column_name, message)
  record[:error_data] ||= { messages: {} }

  messages_by_column = record[:error_data][:messages]
  column_key = column_name.to_sym

  # Start a fresh list the first time a column is reported, then append.
  (messages_by_column[column_key] ||= []) << message
end

#core_records_classObject

If taxonomicStatus is “obsolete combination” and the name is an original combination, then don’t create a protonym; the valid combination will create the original combination relationship when it is processed.

If it’s not the original combination, make it a dependent of the valid/current name (so the protonym is created first) and then make a new combination as recorded.

Valid/current names don’t have to be valid; they could be a synonym or homonym. Importantly, each protonym should have only one current name.

If the status is homonym, set the status of the name to homonym (DwC doesn’t give us the information to assert what it is a homonym of). Note that acceptedNameUsage may be either the replacement name (in the case of a homonym) or the valid name (in the case of a synonym).



24
25
26
# File 'app/models/import_dataset/darwin_core/checklist.rb', line 24

# Record class for the core rows of this dataset; #perform_staging builds
# one instance of it per core row.
#
# @return [Class] DatasetRecord::DarwinCore::Taxon
def core_records_class = DatasetRecord::DarwinCore::Taxon

#core_records_identifier_nameObject



28
29
30
# File 'app/models/import_dataset/darwin_core/checklist.rb', line 28

# Name of the column that identifies a core record.
#
# @return [String] always 'taxonID'
def core_records_identifier_name = 'taxonID'

#perform_stagingObject

Stages core (Taxon) records and all extension records.



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'app/models/import_dataset/darwin_core/checklist.rb', line 35

# Stages core (Taxon) records and all extension records.
#
# Steps:
# 1. Read the records and headers from the source file and store the headers in +metadata+.
# 2. Group core rows by original combination (originalNameUsageID) and decide, per group,
#    which row becomes the protonym; the remaining rows become combinations that depend on it.
# 3. Record parent dependencies, hybrid flag and parse failures for every row.
# 4. Save one DatasetRecord::DarwinCore::Taxon per core row ('Ready' only when it has no errors,
#    no dependencies and no parent, otherwise 'NotReady') and one
#    DatasetRecord::DarwinCore::Extension per extension row (status 'Unsupported').
#
# Row-level problems (IDs that cannot be resolved within the dataset, unparsable names) do not
# abort staging; they are attached to the row via #add_error_message.
#
# NOTE(review): persistence uses update!/save!, which presumably raise on failure -- confirm
# callers expect that.
def perform_staging
  records, headers = get_records(source.path)

  update!(metadata:
    metadata.merge({
      core_headers: headers[:core],
      extensions_headers: headers[:extensions]
    })
  )

  parse_results_ary = Biodiversity::Parser.parse_ary(records[:core].map { |r| r['scientificName'] || '' })

  # hash of taxonID, record metadata
  records_lut = {}

  # array of record metadata, in row order (the same hash objects that records_lut points to)
  core_records = records[:core].each_with_index.map do |record, index|
    records_lut[record['taxonID']] = {
      index:,
      type: nil, # will be protonym or combination
      dependencies: [],
      dependants: [],
      synonyms: [],
      synonym_of: nil, # index of current/valid name
      replacing_valid_name: nil, # taxonID of current/valid name, if record is a homonym or synonym
      is_hybrid: nil,
      is_synonym: nil,
      has_external_accepted_name: nil, # could be homonym or synonym, either way protonym is not valid. will use taxonomicStatus to determine the kind of relationship
      original_combination: nil, # taxonID of original combination
      create_original_combination: true, # default to creating an original combination, is set to false if missing
      protonym_taxon_id: nil,
      parent: record['parentNameUsageID'],
      src_data: record
    }
  end

  # PROCESS OVERVIEW
  # if current name is valid, acceptedNameUsageID will be inside the original combination group, use that row for the protonym
  # if current name is synonym or homonym, acceptedNameUsageID won't be in the group, but use the synonym/homonym row for creating the protonym
  # if synonym has different rank and parent from accepted name, find and select a name in the group that matches for the protonym
  #
  # make combination relationships for other names in group
  # make other names dependent on valid name
  #
  # Create original combination relationship for each key in original_combination_groups
  # The protonym should be dependent on the parent of the original combination if it's a subsequent combination

  # identify protonyms by grouping by original combination
  # (row index of the original combination => row indexes of all names sharing it)
  original_combination_groups = {}

  core_records.each_with_index do |record, index|
    src = record[:src_data]

    # When originalNameUsageID is absent, assume the row is its own original combination
    if src['originalNameUsageID'].blank?
      src['originalNameUsageID'] = src['taxonID']
      record[:create_original_combination] = false # we assumed, don't make the relationship during import
    end

    original_combination = records_lut[src['originalNameUsageID']]
    if original_combination.nil?
      add_error_message(record, :originalNameUsageID, 'originalNameUsageID not found in dataset')
      next
    end
    oc_index = original_combination[:index]

    # misspellings are treated as separate protonyms, so don't bundle them in original combination with the correct spelling
    # "original misspelling" is also treated this way
    oc_index = index if src['taxonomicStatus']&.include?('misspelling')

    (original_combination_groups[oc_index] ||= []) << index
  end

  # TODO: Move to Constant?
  current_taxonomic_status = Set['valid', 'homonym', 'synonym', 'excluded', 'unidentifiable', 'incertae sedis', 'unavailable'].freeze

  # make combinations dependent on the protonym of each OC group
  original_combination_groups.each do |oc_index, name_items|
    accepted_name = records_lut[core_records[oc_index][:src_data]['acceptedNameUsageID']]

    if accepted_name.nil?
      name_items.each do |i|
        add_error_message(core_records[i], :acceptedNameUsageID, 'acceptedNameUsageID not found in dataset')
      end
      next
    end

    if name_items.size > 1
      # find the valid name of the group, first by seeing if acceptedNameUsageID is in group, otherwise check against list of known current statuses
      current_item = nil

      # Accepted name of the group's original combination (expected to be the same for all items in a group)
      accepted_name_index = accepted_name[:index]

      # if the accepted name is in the group, use it for creating the protonym
      # if it's not in the group, search the statuses of the items to find most eligible name (this happens with synonyms and homonyms)
      if name_items.include? accepted_name_index
        current_item = accepted_name_index
      else
        name_items.each do |index|
          item_src = core_records[index][:src_data]

          if item_src['taxonomicStatus'] == 'synonym'
            # if synonym, make sure parent and rank are the same as the valid name
            # if they aren't, find a name in the group that does match
            # (valid_src is nil when this row's accepted name is not in the dataset; nothing matches then)
            valid_src = records_lut.dig(item_src['acceptedNameUsageID'], :src_data)
            matches_valid_name = lambda do |i|
              candidate = core_records[i][:src_data]
              !valid_src.nil? &&
                valid_src['taxonRank'] == candidate['taxonRank'] &&
                valid_src['parentNameUsageID'] == candidate['parentNameUsageID']
            end

            # Prefer the synonym row itself, then any matching row in the group; only when no
            # row matches fall back to the row carrying the synonym status.
            current_item =
              if matches_valid_name.call(index)
                index
              else
                name_items.find(&matches_valid_name) || index
              end
            core_records[current_item][:is_synonym] = true
            break
          elsif current_taxonomic_status.include? item_src['taxonomicStatus']
            current_item = index
            break
          end
        end

        # TODO handle if no names in group are marked as current / all are obsolete combinations
        current_item = name_items.first if current_item.nil?

        core_records[current_item][:has_external_accepted_name] = true

        replacement_taxon_id = core_records[current_item][:src_data]['acceptedNameUsageID']
        core_records[current_item][:replacing_valid_name] = replacement_taxon_id

        # The replacement may be missing from the dataset (that is reported per record further
        # down), so guard the lookups the same way the single-name branch does.
        dependency = records_lut.dig(replacement_taxon_id, :index)
        core_records[current_item][:dependencies] << dependency if dependency
        records_lut.dig(replacement_taxon_id, :dependants)&.push(current_item)
      end

      current_record = core_records[current_item]
      other_items = name_items.reject { |i| i == current_item }

      current_record[:type] = :protonym
      current_record[:dependants].concat other_items
      current_record[:protonym_taxon_id] = current_record[:src_data]['taxonID']

      current_record[:original_combination] = current_record[:src_data]['originalNameUsageID']

      # make other names combinations, dependants of current name
      other_items.each do |index|
        core_records[index][:type] = :combination
        core_records[index][:dependencies] << current_item
        core_records[index][:protonym_taxon_id] = current_record[:src_data]['taxonID']
      end

      # make protonym depend on original combination's parent, if protonym is not the original combination
      # do not make valid record depend on self if OC's parent is the valid name. Ex: Aus with OC Aus (Aus)
      oc_parent_id = core_records[oc_index][:parent]
      if oc_parent_id.present? && current_record[:index] != oc_index && oc_parent_id != current_record[:src_data]['taxonID']
        # A parent missing from the dataset is reported on the OC row by the parent check below.
        oc_parent = records_lut[oc_parent_id]
        if oc_parent
          current_record[:dependencies] << oc_parent[:index]
          oc_parent[:dependants] << current_record[:index]
        end
      end

    else
      # if original combination is only name, make it the protonym
      # TODO is it better to replace name_items.first with oc_index?
      current_record = core_records[name_items.first]
      current_record[:type] = :protonym
      current_record[:original_combination] = current_record[:src_data]['taxonID']
      current_record[:protonym_taxon_id] = current_record[:src_data]['taxonID']

      # see if protonym is synonym (or even homonym?), and set replacing_valid_name if so
      if current_record[:src_data]['acceptedNameUsageID'] != current_record[:src_data]['taxonID']
        replacement_taxon_id = current_record[:src_data]['acceptedNameUsageID']
        current_record[:replacing_valid_name] = replacement_taxon_id
        current_record[:has_external_accepted_name] = true
        dependency = records_lut.dig(replacement_taxon_id, :index)
        current_record[:dependencies] << dependency if dependency
        records_lut.dig(replacement_taxon_id, :dependants)&.push(current_record[:index])

        current_record[:is_synonym] = (current_record[:src_data]['taxonomicStatus'] == 'synonym')
      end

    end
  end

  core_records.each_with_index do |record, index|
    accepted_name_usage = records_lut[record[:src_data]['acceptedNameUsageID']]

    unless accepted_name_usage
      # TODO are we already checking this higher up?
      add_error_message(record, :acceptedNameUsageID, "acceptedNameUsageID '#{record[:src_data]["acceptedNameUsageID"]}' not found")
    end

    record[:parent] = nil if record[:parent].blank?

    parse_results = parse_results_ary[index]

    record[:is_hybrid] = !!parse_results[:hybrid]

    # names the parser could not break down cannot be typed as protonym or combination
    unless parse_results[:details]
      record[:type] = :unknown
      add_error_message(record, :scientificName, "Scientific name #{record[:src_data]['scientificName']} could not be parsed")
    end

    unless record[:parent].nil?
      if records_lut[record[:parent]]
        parent_index = records_lut[record[:parent]][:index]
        record[:dependencies] << parent_index
        core_records[parent_index][:dependants] << record[:index]
      else
        add_error_message(record, :parentNameUsageID, 'parentNameUsageID not found in dataset')
      end
    end
  end

  # replace dependencies and dependants index values with taxonID values
  core_records.each do |record|
    record[:dependants].map! { |i| core_records[i][:src_data]['taxonID'] }.uniq!
    record[:dependencies].map! { |i| core_records[i][:src_data]['taxonID'] }.uniq!
  end

  # create new dataset record for each row and mark items as ready
  core_records.each do |record|
    dwc_taxon = DatasetRecord::DarwinCore::Taxon.new(import_dataset: self)
    dwc_taxon.initialize_data_fields(record[:src_data].map { |_, v| v })
    dwc_taxon.status = !record[:error_data] && record[:dependencies] == [] && record[:parent].nil? ? 'Ready' : 'NotReady'
    # the source row is stored in the data fields above; keep only the staging metadata
    record.delete(:src_data)
    dwc_taxon.metadata = record

    dwc_taxon.save!
  end

  records[:extensions].each do |extension_type, extension_records|
    extension_records.each do |record|
      dwc_extension = DatasetRecord::DarwinCore::Extension.new(import_dataset: self)
      dwc_extension.initialize_data_fields(record.map { |_, v| v })
      dwc_extension.status = 'Unsupported'
      dwc_extension.metadata = { type: extension_type }

      dwc_extension.save!
    end
  end
end

#use_existing_hierarchy?Boolean

rubocop:enable

Returns:

  • (Boolean)


300
301
302
# File 'app/models/import_dataset/darwin_core/checklist.rb', line 300

# Whether the 'use_existing_taxon_hierarchy' import setting is switched on.
#
# Reads metadata['import_settings']['use_existing_taxon_hierarchy'] and coerces
# it to a strict boolean, so a missing section or key yields false.
#
# @return [Boolean]
def use_existing_hierarchy?
  !!metadata.dig('import_settings', 'use_existing_taxon_hierarchy')
end