Class: ImportDataset::DarwinCore

Inherits:
ImportDataset
Defined in:
app/models/import_dataset/darwin_core.rb

Direct Known Subclasses

Checklist, Occurrences, Unknown

Defined Under Namespace

Classes: Checklist, Occurrences, Unknown

Constant Summary

CHECKLIST_ROW_TYPE =
'http://rs.tdwg.org/dwc/terms/Taxon'.freeze
OCCURRENCES_ROW_TYPE =
'http://rs.tdwg.org/dwc/terms/Occurrence'.freeze

Instance Attribute Summary

Attributes inherited from ImportDataset

#description, #metadata, #source_content_type, #source_file_name, #source_file_size, #source_updated_at, #status

Class Method Summary

Instance Method Summary

Methods inherited from ImportDataset

#delete_origin_relationships

Methods included from Shared::OriginRelationship

#new_objects, #old_objects, #reject_origin_relationships, #set_origin

Methods included from Shared::IsData

#errors_excepting, #full_error_messages_excepting, #identical, #is_community?, #is_destroyable?, #is_editable?, #is_in_use?, #is_in_users_projects?, #metamorphosize, #similar

Methods included from Housekeeping

#has_polymorphic_relationship?

Methods inherited from ApplicationRecord

transaction_with_retry

Constructor Details

#initialize(params) ⇒ DarwinCore

Returns a new instance of DarwinCore.



# File 'app/models/import_dataset/darwin_core.rb', line 11

def initialize(params)
  import_settings = params&.delete(:import_settings)
  super(params)

  self.metadata = {
    core_headers: [],
    namespaces: {
      core: nil,
      eventID: nil
    }
  }

  set_import_settings(import_settings || {})
end
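
The constructor pulls :import_settings out of params before handing the rest to ActiveRecord, then seeds metadata with empty core_headers and namespaces. A hedged usage sketch (attribute values are illustrative; in practice one of the subclasses is instantiated):

# Sketch only: values are made up.
dataset = ImportDataset::DarwinCore::Occurrences.new(
  description: 'My DwC-A upload',
  import_settings: { 'nomenclatural_code' => 'iczn' }
)
# :import_settings was removed from params and merged into metadata['import_settings'];
# metadata also now holds the initial core_headers and namespaces entries.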

Class Method Details

.create_with_subtype_detection(params) ⇒ Checklist, Occurrences, Unknown

Returns the appropriate ImportDataset::DarwinCore subclass instantiated (not saved) for the supplied params

Parameters:

  • params (Hash)

    Dataset attributes; params[:source] is the uploaded DwC-A (zip) or spreadsheet/text file, and params[:import_settings] may carry settings such as row_type

Returns:

  • (Checklist, Occurrences, Unknown)


# File 'app/models/import_dataset/darwin_core.rb', line 34

def self.create_with_subtype_detection(params)
  core_type = nil

  return Unknown.new unless params[:source]

  begin
    path = params[:source].tempfile.path
    if path =~ /\.zip\z/i
      dwc = ::DarwinCore.new(path)
      core_type = dwc.core.data[:attributes][:rowType]

      ### Check all files are readable
      [dwc.core, *dwc.extensions].each do |table|
        table.read { |data, errors| raise 'Errors found when reading data' unless errors.empty? }
      end
    else
      if path =~ /\.(xlsx?|ods)\z/i
        headers = CSV.parse(Roo::Spreadsheet.open(path).to_csv, headers: true, header_converters: lambda {|f| f.strip}).headers
      else
        headers = CSV.read(path, headers: true, col_sep: "\t", quote_char: nil, encoding: 'bom|utf-8', header_converters: lambda {|f| f.strip}).headers
      end

      row_type = params.dig(:import_settings, :row_type)
      if row_type
        core_type = row_type
      elsif headers.include? 'occurrenceID'
        core_type = OCCURRENCES_ROW_TYPE
      elsif headers.include? 'taxonID'
        core_type = CHECKLIST_ROW_TYPE
      end
    end
  rescue Errno::ENOENT, RuntimeError => e # TODO: dwc-archive gem should probably detect missing (or wrongly mapped) files and raise its own exception
    return Unknown.new(params.merge({error_message: "#{e.message}"}))
  end

  case core_type
  when OCCURRENCES_ROW_TYPE
    Occurrences.new(params)
  when CHECKLIST_ROW_TYPE
    Checklist.new(params)
  else
    Unknown.new(params.merge({error_message: "unknown DwC-A core type '#{core_type}'."}))
  end
end
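
A hedged usage sketch, assuming uploaded_file is an uploaded-file object (e.g. ActionDispatch::Http::UploadedFile); the method only relies on it responding to #tempfile:

# Sketch only: the returned object is instantiated but not saved.
dataset = ImportDataset::DarwinCore.create_with_subtype_detection(
  source: uploaded_file,
  description: 'Museum occurrences',   # illustrative attribute
  import_settings: { row_type: ImportDataset::DarwinCore::OCCURRENCES_ROW_TYPE }
)
dataset.class   # => ImportDataset::DarwinCore::Occurrences, ::Checklist or ::Unknown
dataset.save    # caller decides whether and when to persist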

Instance Method Details

#add_filters(records, filters) ⇒ Object (private)



# File 'app/models/import_dataset/darwin_core.rb', line 343

def add_filters(records, filters)
  filters&.each do |key, value|
    records = records.where(id: core_records_fields.at(key.to_i).with_value(value).select(:dataset_record_id))
  end
  records
end

#core_records_fieldsObject



# File 'app/models/import_dataset/darwin_core.rb', line 26

def core_records_fields
  dataset_record_fields.with_record_class(core_records_class)
end

#default_nomenclatural_codeObject



# File 'app/models/import_dataset/darwin_core.rb', line 238

def default_nomenclatural_code
  self.metadata.dig('import_settings', 'nomenclatural_code')&.downcase&.to_sym || :iczn
end

#destroy_namespaceObject (private)



# File 'app/models/import_dataset/darwin_core.rb', line 339

def destroy_namespace
  Namespace.find_by(id: metadata['identifier_namespace'])&.destroy # If in use or gone no deletion happens
end

#get_col_sepObject (private)



# File 'app/models/import_dataset/darwin_core.rb', line 314

def get_col_sep
  return_default_if_absent(metadata.dig("import_settings", "col_sep"), "\t")
end

#get_core_record_identifier_namespaceObject



# File 'app/models/import_dataset/darwin_core.rb', line 211

def get_core_record_identifier_namespace
  id = metadata.dig('namespaces', 'core')

  if id.nil? || (@core_record_identifier_namespace ||= Namespace.find_by(id:)).nil?
    random = SecureRandom.hex(4)
    project_name = Project.find(Current.project_id).name

    namespace_name = "#{core_records_identifier_name} namespace for \"#{description}\" dataset in \"#{project_name}\" project [#{random}]"

    @core_record_identifier_namespace = Namespace.create!(
      name: namespace_name,
      short_name: "#{core_records_identifier_name}-#{random}",
      verbatim_short_name: core_records_identifier_name,
      delimiter: ':'
    )

    metadata.deep_merge!({
      'namespaces' => {
        'core' => @core_record_identifier_namespace.id
      }
    })
    save!
  end

  @core_record_identifier_namespace
end
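
On first call this lazily creates a project-scoped Namespace for the core record identifiers and stores its id under metadata['namespaces']['core']; later calls return that same record. A brief behavioral sketch (values illustrative):

ns = dataset.get_core_record_identifier_namespace
ns.is_a?(Namespace)                                   # => true
ns.delimiter                                          # => ':'
dataset.metadata.dig('namespaces', 'core') == ns.id   # => true, the id is persisted for reuse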

#get_dwc_default_values(table) ⇒ Object (private)



# File 'app/models/import_dataset/darwin_core.rb', line 328

def get_dwc_default_values(table)
  table.fields.select { |f| f.has_key? :default }
end

#get_dwc_headers(table) ⇒ Object (protected)



# File 'app/models/import_dataset/darwin_core.rb', line 275

def get_dwc_headers(table)
  headers = []

  headers[table.id[:index]] = 'id' if table.id
  table.fields.each { |f| headers[f[:index]] = get_normalized_dwc_term(f) if f[:index] }

  table.read_header.first&.each_with_index { |f, i| headers[i] ||= f.strip }

  get_dwc_default_values(table).each.with_index(headers.length) { |f, i| headers[i] = get_normalized_dwc_term(f) }

  headers
end

#get_dwc_records(table) ⇒ Object (protected)



# File 'app/models/import_dataset/darwin_core.rb', line 288

def get_dwc_records(table)
  records = []
  headers = get_dwc_headers(table)

  records = table.read.first.map do |row|
    record = {}
    row.each_with_index { |v, i| record[headers[i]] = v }
    defaults = get_dwc_default_values(table)
    defaults.each.with_index(headers.length - defaults.length) { |f, i| record[headers[i]] = f[:default] }
    record
  end

  return records
end

#get_field_mapping(field_name) ⇒ Object (protected)



# File 'app/models/import_dataset/darwin_core.rb', line 303

def get_field_mapping(field_name)
  get_fields_mapping[field_name.to_s.downcase]
end

#get_fields_mappingObject (private)



# File 'app/models/import_dataset/darwin_core.rb', line 322

def get_fields_mapping
  @fields_mapping ||= metadata['core_headers']
    .reject(&:nil?)
    .each.with_index.inject({}) { |m, (h, i)| m.merge({ h.downcase => i, i => h}) }
end

#get_normalized_dwc_term(field) ⇒ Object (private)



# File 'app/models/import_dataset/darwin_core.rb', line 332

def get_normalized_dwc_term(field)
  # TODO: Think what to do about complex namespaces like "/std/Iptc4xmpExt/2008-02-29/" (currently returning the full URI as header)
  term = field[:term].match(/\/([^\/]+)\/terms\/.*(?<=\/)([^\/]+)\/?$/)
  #headers[field[:index]] = term ? term[1..2].join(":") : field[:term]
  term ? term[2] : field[:term]
end
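
The regexp keeps only the last path segment of URIs shaped like .../<vocabulary>/terms/<term>, and falls back to the full URI otherwise. Illustrative inputs and outputs (the method is private; it is called here only to show the mapping):

get_normalized_dwc_term(term: 'http://rs.tdwg.org/dwc/terms/occurrenceID')
# => "occurrenceID"
get_normalized_dwc_term(term: 'http://purl.org/dc/terms/modified')
# => "modified"
get_normalized_dwc_term(term: 'http://ns.adobe.com/photoshop/1.0/City')
# => "http://ns.adobe.com/photoshop/1.0/City" (no /terms/ segment, full URI kept)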

#get_quote_charObject (private)



# File 'app/models/import_dataset/darwin_core.rb', line 318

def get_quote_char
  return_default_if_absent(metadata.dig("import_settings", "quote_char"), "\"")
end

#get_records(source) ⇒ Object (protected)



# File 'app/models/import_dataset/darwin_core.rb', line 244

def get_records(source)
  records = { core: [], extensions: {} }
  headers = { core: [], extensions: {} }

  if source.path =~ /\.zip\z/i
    dwc = ::DarwinCore.new(source.path)

    headers[:core] = get_dwc_headers(dwc.core)
    records[:core] = get_dwc_records(dwc.core)

    dwc.extensions.each do |extension|
      type = extension.properties[:rowType]
      records[:extensions][type] = get_dwc_records(extension)
      headers[:extensions][type] = get_dwc_headers(extension)
    end
  elsif source.path =~ /\.(csv|txt|tsv|xlsx?|ods)\z/i
    # only strip whitespace on the headers with lambda functions because whitespace is stripped from the data elsewhere
    if source.path =~ /\.(csv|txt|tsv)\z/i
      records[:core] = CSV.read(source.path, headers: true, col_sep: get_col_sep, quote_char: get_quote_char, encoding: 'bom|utf-8', header_converters: lambda {|f| f&.strip})
    else
      records[:core] = CSV.parse(Roo::Spreadsheet.open(source.path).to_csv, headers: true, header_converters: lambda {|f| f&.strip})
    end
    records[:core] = records[:core].map { |r| r.to_h }
    headers[:core] = records[:core].first.to_h.keys
  else
    raise 'Unsupported input format'
  end

  return records, headers
end

#import(max_time, max_records, retry_errored: nil, filters: nil, record_id: nil) ⇒ Hash

Returns the updated dataset records. Do not call if there are unpersisted changes.

Parameters:

  • max_time (Integer)

    Maximum time to spend processing records.

  • max_records (Integer)

    Maximum number of records to be processed.

  • retry_errored (Boolean) (defaults to: nil)

    Also picks up errored records when importing (by default only records with status 'Ready' are selected)

  • filters (Hash) (defaults to: nil)

    (Column-index, value) pairs of filters to apply when searching for records to import (default none)

  • record_id (Integer) (defaults to: nil)

    Indicates a single record to be imported (default none). When used, filters are ignored.

Returns:

  • (Hash)


# File 'app/models/import_dataset/darwin_core.rb', line 124

def import(max_time, max_records, retry_errored: nil, filters: nil, record_id: nil)
  imported = []

  lock_time = Time.now
  old_uuid = self.metadata['import_uuid']
  start_import do
    lock_time = Time.now - lock_time
    filters = self.metadata['import_filters'] if filters.nil?
    retry_errored = self.metadata['import_retry_errored'] if retry_errored.nil?
    start_id = self.metadata['import_start_id'] if retry_errored

    status = ['Ready']
    status << 'Errored' if retry_errored
    records = add_filters(core_records.where(status:), filters).order(:id).limit(max_records) #.preload_fields

    records = records.where(id: start_id..) if start_id
    records = core_records.where(id: record_id, status: %w{Ready Errored}) if record_id

    records = records.all
    start_time = Time.now - lock_time

    dwc_data_attributes = project.preferences['model_predicate_sets'].map do |model, predicate_ids|
      [model, Hash[
        *Predicate.where(id: predicate_ids)
          .select { |p| /^http:\/\/rs\.tdwg\.org\/dwc\/terms\/.*/ =~ p.uri }
          .map {|p| [p.uri.split('/').last, p]}
          .flatten
        ]
      ]
    end.to_h

    records.each do |record|
      imported << record.import(dwc_data_attributes)

      break if 1000.0*(Time.now - start_time).abs > max_time
    end

    if imported.any? && record_id.nil?
      reload
      self.metadata.merge!({
        'import_start_id' => imported.last&.id + 1,
        'import_filters' => filters,
        'import_retry_errored' => retry_errored
      })
      save!

      new_uuid = self.metadata['import_uuid']
      ImportDatasetImportJob.perform_later(self, new_uuid, max_time, max_records) unless old_uuid == new_uuid
    else
      self.stop_import
    end
  end

  imported
end
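
Note that max_time is compared against elapsed milliseconds (the loop multiplies elapsed seconds by 1000), and when records were imported the method saves its progress and, if this call started a fresh import run (new UUID), enqueues ImportDatasetImportJob for the next slice. A minimal hedged sketch of a direct call (values illustrative):

# Sketch only; in normal operation ImportDatasetImportJob drives these calls.
imported = dataset.import(60_000, 250)   # up to 250 records or roughly 60 seconds
imported.size                            # => number of dataset records processed in this batch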

#progress(filters: nil) ⇒ Hash

Returns a hash with the record counts grouped by status

Returns:

  • (Hash)


# File 'app/models/import_dataset/darwin_core.rb', line 182

def progress(filters: nil)
  add_filters(core_records, filters).group(:status).count
end
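
The return value is a plain status => count hash; filters take the same (column index, value) form used by #import. Illustrative sketch:

dataset.progress
# => { "Imported" => 120, "Ready" => 34, "Errored" => 2 }   # illustrative statuses and counts
dataset.progress(filters: { '5' => 'ICZN' })                # hypothetical filter on the column at index 5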

#return_default_if_absent(value, default) ⇒ Object (private)



# File 'app/models/import_dataset/darwin_core.rb', line 309

def return_default_if_absent(value, default)
  return default if value.nil? || value.empty?
  value
end

#set_import_settings(import_settings) ⇒ Object

Sets import settings for this dataset



# File 'app/models/import_dataset/darwin_core.rb', line 204

def set_import_settings(import_settings)
  metadata['import_settings'] ||= {}
  import_settings.each { |k, v| metadata['import_settings'].merge!({k => v}) }

  metadata['import_settings']
end
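
Settings are merged key by key into metadata['import_settings'], so repeated calls accumulate rather than replace, and nothing is persisted until save. A short sketch using keys that appear elsewhere on this page:

dataset.set_import_settings('col_sep' => ',', 'quote_char' => '"')
dataset.set_import_settings('nomenclatural_code' => 'iczn')   # earlier keys are kept
dataset.metadata['import_settings']
# => { "col_sep" => ",", "quote_char" => "\"", "nomenclatural_code" => "iczn" }
dataset.save!   # set_import_settings does not save by itself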

#stageObject

Stages DwC-A records into DB.



# File 'app/models/import_dataset/darwin_core.rb', line 187

def stage
  if status == 'Staging' # ActiveJob being retried could cause this state
    transaction do
      core_records_fields.delete_all
      dataset_records.delete_all
    end
  end

  update!(status: 'Staging') if status == 'Uploaded'

  if status != 'Ready'
    perform_staging
    update!(status: 'Ready')
  end
end
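
stage is safe to retry: an interrupted 'Staging' run deletes the partially staged records before staging again, and it becomes a no-op once the dataset is 'Ready'. A hedged lifecycle sketch:

# Sketch only: typical progression after the dataset has been created and saved.
dataset.status   # => "Uploaded"
dataset.stage    # parses the source and stages dataset records
dataset.status   # => "Ready"
dataset.stage    # no-op, already staged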

#start_import(&block) ⇒ String

Sets up the import dataset for importing and returns a UUID. If an import has already been started the same UUID is returned, unless the last activity was more than 10 minutes ago, in which case a new UUID is generated. Do not call if there are unpersisted changes.

Returns:

  • (String)


# File 'app/models/import_dataset/darwin_core.rb', line 82

def start_import(&block)
  with_lock do
    case self.status
    when 'Ready'
      self.status = 'Importing'
      self.metadata['import_uuid'] = SecureRandom.uuid
    when 'Importing'
      self.metadata['import_uuid'] = SecureRandom.uuid if self.updated_at < 10.minutes.ago
    else
      raise 'Invalid initial state'
    end
    save!

    yield if block_given?
  end

  self.metadata['import_uuid']
end
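
The returned UUID identifies the current import run; #import (above) compares it before and after to decide whether to enqueue a follow-up job. A hedged sketch of the block form:

# Sketch only; raises 'Invalid initial state' unless the dataset is Ready or Importing.
uuid = dataset.start_import do
  # runs inside the row lock, after status has switched to 'Importing'
  process_next_batch   # hypothetical placeholder for the caller's work
end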

#stop_importObject

Sets the import dataset to stop importing data. Do not call if there are unpersisted changes.



# File 'app/models/import_dataset/darwin_core.rb', line 102

def stop_import
  with_lock do
    if self.status == 'Importing'
      self.status = 'Ready'
      self.metadata.except!('import_uuid', 'import_start_id', 'import_filters', 'import_retry_errored')
      save!
    end
  end
end