Class: ImportDataset::DarwinCore

Inherits:
ImportDataset show all
Defined in:
app/models/import_dataset/darwin_core.rb

Direct Known Subclasses

Checklist, Occurrences, Unknown

Defined Under Namespace

Classes: Checklist, Occurrences, Unknown

Constant Summary collapse

CHECKLIST_ROW_TYPE =
'http://rs.tdwg.org/dwc/terms/Taxon'.freeze
OCCURRENCES_ROW_TYPE =
'http://rs.tdwg.org/dwc/terms/Occurrence'.freeze

Instance Attribute Summary

Attributes inherited from ImportDataset

#description, #metadata, #source_content_type, #source_file_name, #source_file_size, #source_updated_at, #status

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ImportDataset

#delete_origin_relationships

Methods included from Shared::OriginRelationship

#new_objects, #old_objects, #reject_origin_relationships, #set_origin

Methods included from Shared::IsData

#errors_excepting, #full_error_messages_excepting, #identical, #is_community?, #is_destroyable?, #is_editable?, #is_in_use?, #is_in_users_projects?, #metamorphosize, #similar

Methods included from Housekeeping

#has_polymorphic_relationship?

Methods inherited from ApplicationRecord

transaction_with_retry

Constructor Details

#initialize(params) ⇒ DarwinCore

Returns a new instance of DarwinCore.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'app/models/import_dataset/darwin_core.rb', line 13

# Builds a new DwC import dataset.
# @param params [Hash] attributes for the record; an optional :import_settings
#   entry is extracted here before the remainder is passed to +super+.
def initialize(params)
  import_settings = params&.delete(:import_settings)
  super(params)

  # Fresh datasets start with empty header/namespace bookkeeping metadata.
  self.metadata = {
    core_headers: [],
    namespaces: {
      core: nil,
      eventID: nil
    }
  }

  set_import_settings(import_settings || {})
end

Class Method Details

.create_with_subtype_detection(params) ⇒ Checklist, ...

Returns the appropriate ImportDataset::DarwinCore subclass instantiated (not saved) for the supplied params

Parameters:

  • file_path (string)

    Path to DwC-A file

Returns:



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'app/models/import_dataset/darwin_core.rb', line 36

# Instantiates (does not save) the appropriate ImportDataset::DarwinCore
# subclass for the supplied params by sniffing the core row type.
# @param params [Hash] params[:source] is the uploaded file (zip DwC-A,
#   spreadsheet, or delimited text); params[:import_settings] may supply
#   :col_sep, :quote_char and :row_type.
# @return [Checklist, Occurrences, Unknown]
def self.create_with_subtype_detection(params)
  core_type = nil

  return Unknown.new unless params[:source]

  begin
    path = params[:source].tempfile.path
    if path =~ /\.zip\z/i
      dwc = ::DarwinCore.new(path)
      core_type = dwc.core.data[:attributes][:rowType]

      ### Check all files are readable
      [dwc.core, *dwc.extensions].each do |table|
        table.read { |data, errors| raise 'Errors found when reading data' unless errors.empty? }
      end
    else
      if path =~ /\.(xlsx?|ods)\z/i
        headers = CSV.parse(Roo::Spreadsheet.open(path).to_csv, headers: true, header_converters: lambda {|f| f.strip}).headers
      else
        col_sep = default_if_absent(params.dig(:import_settings, :col_sep), "\t")
        # Accept the correctly-spelled :quote_char, falling back to the
        # historical misspelling :qoute_char for backward compatibility.
        quote_char = default_if_absent(
          params.dig(:import_settings, :quote_char) || params.dig(:import_settings, :qoute_char),
          '"'
        )
        headers = CSV.read(path, headers: true, col_sep: col_sep, quote_char: quote_char, encoding: 'bom|utf-8', header_converters: lambda {|f| f.strip}).headers
      end

      # An explicit row_type wins; otherwise detect by characteristic ID columns.
      row_type = params.dig(:import_settings, :row_type)
      if row_type
        core_type = row_type
      elsif headers.include? 'occurrenceID'
        core_type = OCCURRENCES_ROW_TYPE
      elsif headers.include? 'taxonID'
        core_type = CHECKLIST_ROW_TYPE
      end
    end
  rescue Errno::ENOENT, RuntimeError => e # TODO: dwc-archive gem should probably detect missing (or wrongly mapped) files and raise its own exception
    return Unknown.new(params.merge({error_message: "#{e.message}"}))
  end

  case core_type
  when OCCURRENCES_ROW_TYPE
    Occurrences.new(params)
  when CHECKLIST_ROW_TYPE
    Checklist.new(params)
  else
    Unknown.new(params.merge({error_message: "unknown DwC-A core type '#{core_type}'."}))
  end
end

.default_if_absent(value, default) ⇒ Object (private)



310
311
312
313
# File 'app/models/import_dataset/darwin_core.rb', line 310

# Returns +value+ unless it is nil or empty, in which case +default+ is returned.
def self.default_if_absent(value, default)
  value.nil? || value.empty? ? default : value
end

Instance Method Details

#add_filters(records, filters) ⇒ Object (private)



344
345
346
347
348
349
# File 'app/models/import_dataset/darwin_core.rb', line 344

# Narrows +records+ so every (column-index, value) pair in +filters+ must match.
# A nil +filters+ leaves the relation untouched.
def add_filters(records, filters)
  (filters || {}).each do |index, expected|
    matching = core_records_fields.at(index.to_i).having_value(expected).select(:dataset_record_id)
    records = records.where(id: matching)
  end
  records
end

#check_field_set ⇒ Object (private)



365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'app/models/import_dataset/darwin_core.rb', line 365

# Validation: every header in the subclass' MINIMUM_FIELD_SET must be present
# in the staged source file; an error on :source is added per missing header.
def check_field_set
  return unless source.staged?

  staged = source.staged_path
  headers =
    if staged =~ /\.zip\z/i
      get_dwc_headers(::DarwinCore.new(staged).core)
    elsif staged =~ /\.(xlsx?|ods)\z/i
      CSV.parse(Roo::Spreadsheet.open(staged).to_csv, headers: true).headers
    else
      CSV.read(staged, headers: true, col_sep: get_col_sep, quote_char: get_quote_char, encoding: 'bom|utf-8').headers
    end

  (self.class::MINIMUM_FIELD_SET - headers).each do |header|
    errors.add(:source, "required field #{header} missing.")
  end
end

#core_records_fields ⇒ Object



28
29
30
# File 'app/models/import_dataset/darwin_core.rb', line 28

# @return [ActiveRecord::Relation] the dataset record fields scoped to this
#   dataset's core record class (as given by +core_records_class+).
def core_records_fields
  dataset_record_fields.with_record_class(core_records_class)
end

#core_records_mapped_fields ⇒ Integer

Returns the indexes of the mapped fields for the core records.

Returns:

  • (Integer)


85
86
87
# File 'app/models/import_dataset/darwin_core.rb', line 85

# @return [Array] the indexes of the mapped fields for the core records,
#   taken from the first core record; [] when there are no core records.
def core_records_mapped_fields
  core_records&.first&.get_mapped_fields(dwc_data_attributes) || []
end

#default_nomenclatural_code ⇒ Object



239
240
241
# File 'app/models/import_dataset/darwin_core.rb', line 239

# @return [Symbol] the nomenclatural code chosen in the import settings
#   (downcased, e.g. :iczn), defaulting to :iczn when not configured.
def default_nomenclatural_code
  metadata.dig('import_settings', 'nomenclatural_code')&.downcase&.to_sym || :iczn
end

#destroy_namespace ⇒ Object (private)



340
341
342
# File 'app/models/import_dataset/darwin_core.rb', line 340

# Destroys the identifier namespace recorded in metadata, if any.
def destroy_namespace
  Namespace.find_by(id: metadata['identifier_namespace'])&.destroy # If in use or gone no deletion happens
end

#dwc_data_attributes ⇒ Object (private)



385
386
387
388
389
390
391
392
393
394
395
# File 'app/models/import_dataset/darwin_core.rb', line 385

# Builds a {model_name => {dwc_term_name => Predicate}} lookup from the
# project's 'model_predicate_sets' preference, keeping only predicates whose
# URI is a Darwin Core term.
def dwc_data_attributes
  project.preferences['model_predicate_sets'].to_h do |model, predicate_ids|
    dwc_predicates = Predicate.where(id: predicate_ids)
      .select { |p| /^http:\/\/rs\.tdwg\.org\/dwc\/terms\/.*/ =~ p.uri }
      .to_h { |p| [p.uri.split('/').last, p] }
    [model, dwc_predicates]
  end
end

#get_col_sep ⇒ Object (private)



315
316
317
# File 'app/models/import_dataset/darwin_core.rb', line 315

# @return [String] the column separator from the import settings, TAB when unset.
def get_col_sep
  DarwinCore.default_if_absent(metadata.dig('import_settings', 'col_sep'), "\t")
end

#get_core_record_identifier_namespace ⇒ Object



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'app/models/import_dataset/darwin_core.rb', line 212

# Returns the Namespace used for the identifiers of this dataset's core
# records, creating one (and persisting its id in metadata) on first use or
# when the recorded namespace no longer exists.
def get_core_record_identifier_namespace
  id = metadata.dig('namespaces', 'core')

  if id.nil? || (@core_record_identifier_namespace ||= Namespace.find_by(id:)).nil?
    # Random suffix keeps the generated name/short_name unique per dataset.
    random = SecureRandom.hex(4)
    project_name = Project.find(Current.project_id).name

    namespace_name = "#{core_records_identifier_name} namespace for \"#{description}\" dataset in \"#{project_name}\" project [#{random}]"

    @core_record_identifier_namespace = Namespace.create!(
      name: namespace_name,
      short_name: "#{core_records_identifier_name}-#{random}",
      verbatim_short_name: core_records_identifier_name,
      delimiter: ':'
    )

    metadata.deep_merge!({
      'namespaces' => {
        'core' => @core_record_identifier_namespace.id
      }
    })
    save!
  end

  @core_record_identifier_namespace
end

#get_dwc_default_values(table) ⇒ Object (private)



329
330
331
# File 'app/models/import_dataset/darwin_core.rb', line 329

# @return [Array<Hash>] the subset of +table+'s field definitions carrying a
#   :default value (used to synthesize constant-valued columns).
def get_dwc_default_values(table)
  table.fields.filter { |field| field.key?(:default) }
end

#get_dwc_headers(table) ⇒ Object (protected)



276
277
278
279
280
281
282
283
284
285
286
287
# File 'app/models/import_dataset/darwin_core.rb', line 276

# Computes the ordered header names for a DwC-A table.
# Precedence: declared id/field mappings, then the file's own header row to
# fill the gaps, then synthesized columns for default-valued fields appended
# at the end.
def get_dwc_headers(table)
  headers = []

  # Mapped columns first: the archive's id column (if any) and indexed fields.
  headers[table.id[:index]] = 'id' if table.id
  table.fields.each { |f| headers[f[:index]] = get_normalized_dwc_term(f) if f[:index] }

  # Fill remaining positions from the raw header row; ||= keeps mapped names.
  table.read_header.first&.each_with_index { |f, i| headers[i] ||= f.strip }

  # Default-valued fields become extra columns after all real ones.
  get_dwc_default_values(table).each.with_index(headers.length) { |f, i| headers[i] = get_normalized_dwc_term(f) }

  headers
end

#get_dwc_records(table) ⇒ Object (protected)



289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'app/models/import_dataset/darwin_core.rb', line 289

# Reads all rows of a DwC-A table into an array of {header => value} hashes,
# appending default-valued fields after the regular columns.
def get_dwc_records(table)
  headers = get_dwc_headers(table)
  defaults = get_dwc_default_values(table) # loop-invariant: compute once, not per row

  records = table.read.first.map do |row|
    record = {}
    row.each_with_index { |v, i| record[headers[i]] = v }
    # Default-valued columns occupy the trailing header slots.
    defaults.each.with_index(headers.length - defaults.length) { |f, i| record[headers[i]] = f[:default] }
    record
  end

  records
end

#get_field_mapping(field_name) ⇒ Object (protected)



304
305
306
# File 'app/models/import_dataset/darwin_core.rb', line 304

# @return [Integer, nil] the column index mapped to +field_name+
#   (case-insensitive lookup through the memoized fields mapping).
def get_field_mapping(field_name)
  get_fields_mapping[field_name.to_s.downcase]
end

#get_fields_mapping ⇒ Object (private)



323
324
325
326
327
# File 'app/models/import_dataset/darwin_core.rb', line 323

# @return [Hash] memoized two-way mapping for the core headers: downcased
#   header name => index, and index => original header name.
#   NOTE: indexes are assigned after nil headers are rejected.
def get_fields_mapping
  @fields_mapping ||= metadata['core_headers']
    .reject(&:nil?)
    .each.with_index.inject({}) { |m, (h, i)| m.merge({ h.downcase => i, i => h}) }
end

#get_normalized_dwc_term(field) ⇒ Object (private)



333
334
335
336
337
338
# File 'app/models/import_dataset/darwin_core.rb', line 333

# Extracts the short term name from a ".../terms/..." URI; any URI that does
# not match the pattern is returned unchanged.
# TODO: Think what to do about complex namespaces like "/std/Iptc4xmpExt/2008-02-29/" (currently returning the full URI as header)
def get_normalized_dwc_term(field)
  match = field[:term].match(/\/([^\/]+)\/terms\/.*(?<=\/)([^\/]+)\/?$/)
  match.nil? ? field[:term] : match[2]
end

#get_quote_char ⇒ Object (private)



319
320
321
# File 'app/models/import_dataset/darwin_core.rb', line 319

# @return [String] the quote character from the import settings, '"' when unset.
def get_quote_char
  DarwinCore.default_if_absent(metadata.dig('import_settings', 'quote_char'), '"')
end

#get_records(path) ⇒ Object (protected)



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'app/models/import_dataset/darwin_core.rb', line 245

# Reads the file at +path+ into memory.
# @return [Array(Hash, Hash)] a [records, headers] pair; both have a :core
#   entry, and for zip archives an :extensions hash keyed by each extension's
#   rowType. Raises RuntimeError for unsupported file extensions.
def get_records(path)
  records = { core: [], extensions: {} }
  headers = { core: [], extensions: {} }

  if path =~ /\.zip\z/i
    dwc = ::DarwinCore.new(path)

    headers[:core] = get_dwc_headers(dwc.core)
    records[:core] = get_dwc_records(dwc.core)

    dwc.extensions.each do |extension|
      type = extension.properties[:rowType]
      records[:extensions][type] = get_dwc_records(extension)
      headers[:extensions][type] = get_dwc_headers(extension)
    end
  elsif path =~ /\.(csv|txt|tsv|xlsx?|ods)\z/i
    # only strip whitespace on the headers with lambda functions because whitespace is stripped from the data elsewhere
    if path =~ /\.(csv|txt|tsv)\z/i
      records[:core] = CSV.read(path, headers: true, col_sep: get_col_sep, quote_char: get_quote_char, encoding: 'bom|utf-8', header_converters: lambda {|f| f&.strip})
    else
      # Spreadsheets are round-tripped through CSV so both paths share parsing.
      records[:core] = CSV.parse(Roo::Spreadsheet.open(path).to_csv, headers: true, header_converters: lambda {|f| f&.strip})
    end
    records[:core] = records[:core].map { |r| r.to_h }
    headers[:core] = records[:core].first.to_h.keys
  else
    raise 'Unsupported input format'
  end

  return records, headers
end

#import(max_time, max_records, retry_errored: nil, filters: nil, record_id: nil) ⇒ Hash

Returns the updated dataset records. Do not call if there are changes that have not been persisted

Parameters:

  • max_time (Integer)

    Maximum time to spend processing records.

  • max_records (Integer)

    Maximum number of records to be processed.

  • retry_errored (Boolean) (defaults to: nil)

    Also looks up for errored records when importing (default is looking for records with Status=Ready)

  • filters (Hash) (defaults to: nil)

    (Column-index, value) pairs of filters to apply when searching for records to import (default none)

  • record_id (Integer) (defaults to: nil)

    Indicates the record to be imported (default none). When used filters are ignored.

Returns:

  • (Hash)


134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'app/models/import_dataset/darwin_core.rb', line 134

# Imports a batch of staged records. Do not call if there are changes that
# have not been persisted.
# @param max_time [Integer] time budget (milliseconds, per the 1000.0 factor below).
# @param max_records [Integer] maximum number of records to process.
# @param retry_errored [Boolean] also pick up 'Errored' records (default: persisted setting).
# @param filters [Hash] (column-index, value) filter pairs (default: persisted setting).
# @param record_id [Integer] import only this record; filters are ignored when set.
# @return [Array] the updated dataset records.
def import(max_time, max_records, retry_errored: nil, filters: nil, record_id: nil)
  imported = []

  lock_time = Time.now
  old_uuid = self.metadata['import_uuid']
  start_import do
    lock_time = Time.now - lock_time # time spent waiting for the lock
    filters = self.metadata['import_filters'] if filters.nil?
    retry_errored = self.metadata['import_retry_errored'] if retry_errored.nil?
    start_id = self.metadata['import_start_id'] if retry_errored

    status = ['Ready']
    status << 'Errored' if retry_errored
    records = add_filters(core_records.where(status:), filters).order(:id).limit(max_records) #.preload_fields

    records = records.where(id: start_id..) if start_id
    records = core_records.where(id: record_id, status: %w{Ready Errored}) if record_id

    records = records.all
    start_time = Time.now - lock_time # discount lock wait from the time budget

    records.each do |record|
      imported << record.import(dwc_data_attributes)

      break if 1000.0*(Time.now - start_time).abs > max_time
    end

    if imported.any? && record_id.nil?
      reload
      self.metadata.merge!({
        'import_start_id' => imported.last&.id + 1,
        'import_filters' => filters,
        'import_retry_errored' => retry_errored
      })
      save!

      # Enqueue a continuation job only when the import UUID changed (i.e. this
      # call initiated the import run).
      new_uuid = self.metadata['import_uuid']
      ImportDatasetImportJob.perform_later(self, new_uuid, max_time, max_records) unless old_uuid == new_uuid
    else
      self.stop_import
    end
  end

  imported
end

#progress(filters: nil) ⇒ Hash

Returns a hash with the record counts grouped by status

Returns:

  • (Hash)


183
184
185
# File 'app/models/import_dataset/darwin_core.rb', line 183

# @param filters [Hash, nil] optional (column-index, value) pairs narrowing the count.
# @return [Hash] core record counts grouped by status.
def progress(filters: nil)
  add_filters(core_records, filters).group(:status).count
end

#set_import_settings(import_settings) ⇒ Object

Sets import settings for this dataset



205
206
207
208
209
210
# File 'app/models/import_dataset/darwin_core.rb', line 205

# Merges the supplied settings into this dataset's stored import settings.
# @param import_settings [Hash] settings to add/overwrite.
# @return [Hash] the updated 'import_settings' metadata entry.
def set_import_settings(import_settings)
  metadata['import_settings'] ||= {}
  import_settings.each { |k, v| metadata['import_settings'].merge!({k => v}) }

  metadata['import_settings']
end

#stage ⇒ Object

Stages DwC-A records into DB.



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'app/models/import_dataset/darwin_core.rb', line 188

# Stages DwC-A records into the DB. Safe to re-run: a dataset left in
# 'Staging' (e.g. by a retried ActiveJob) is wiped and staged from scratch.
def stage
  if status == 'Staging'
    # Remove any partial rows from an interrupted staging attempt.
    transaction do
      core_records_fields.delete_all
      dataset_records.delete_all
    end
  end

  update!(status: 'Staging') if status == 'Uploaded'

  return if status == 'Ready'

  perform_staging
  update!(status: 'Ready')
end

#start_import(&block) ⇒ String

Sets up import dataset for import and returns UUID. If already started same UUID is returned (unless last activity was more than 10 minutes ago). Do not call if there are changes that have not been persisted

Returns:

  • (String)


92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'app/models/import_dataset/darwin_core.rb', line 92

# Sets up the dataset for import and returns the import UUID. If an import is
# already running the same UUID is returned, unless it has been inactive for
# more than 10 minutes (then a new one is issued). Do not call if there are
# changes that have not been persisted.
# @return [String] the import UUID.
def start_import(&block)
  with_lock do
    case self.status
    when 'Ready'
      self.status = 'Importing'
      self.metadata['import_uuid'] = SecureRandom.uuid
    when 'Importing'
      # Stale import (no activity for 10+ minutes): take over with a new UUID.
      self.metadata['import_uuid'] = SecureRandom.uuid if self.updated_at < 10.minutes.ago
    else
      raise 'Invalid initial state'
    end
    save!

    yield if block_given?
  end

  self.metadata['import_uuid']
end

#stop_import ⇒ Object

Sets import dataset to stop importing data. Do not call if there are changes that have not been persisted.



112
113
114
115
116
117
118
119
120
# File 'app/models/import_dataset/darwin_core.rb', line 112

# Stops an ongoing import and clears the import bookkeeping metadata.
# Do not call if there are changes that have not been persisted.
def stop_import
  with_lock do
    if self.status == 'Importing'
      self.status = 'Ready'
      self.metadata.except!('import_uuid', 'import_start_id', 'import_filters', 'import_retry_errored')
      save!
    end
  end
end

#well_formed ⇒ Object (private)



351
352
353
354
355
356
357
358
359
360
361
362
363
# File 'app/models/import_dataset/darwin_core.rb', line 351

# Validation: the staged file must parse and contain no duplicate headers
# (case-insensitive). Problems are reported via +errors+; always returns true.
def well_formed
  headers = get_records(source.staged_path).last[:core]
  repeated = headers.compact.map(&:downcase).tally.filter_map { |name, count| name if count > 1 }

  errors.add(:source, "Duplicate headers found: #{repeated.join(', ')}") unless repeated.empty?

  true
rescue RuntimeError
  errors.add(:source, 'A problem occurred when reading the data file. If this is a text file please make sure the selected string and field delimiters are correct.')
  true
end