Module: Export::Coldp

Defined in:
lib/export/coldp.rb

Overview

Exports to the Catalog of Life in the new “coldp” format. api.col.plus/datapackage

  • write tests to check for coverage (missing methods)

  • Update all file formats to use tabs

  • Pending handling of both BibTeX and Verbatim

Constant Summary collapse

FILETYPES =
%w{Distribution Name NameRelation SpeciesInteraction Synonym TaxonConceptRelation TypeMaterial VernacularName Taxon References}.freeze

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.skipped_combinations ⇒ Array

Used to pass along inference at the name.tsv level to the synonym.tsv level. Could be replaced when Combination can use SQL to determine the rank of the name the Combination applies to. !! Should just cache this.

Returns:

  • (Array)

    Used to pass along inference at the name.tsv level to the synonym.tsv level.



26
27
28
# File 'lib/export/coldp.rb', line 26

# Reader for the @skipped_combinations instance variable.
#
# @return [Array, nil] whatever was last stored in @skipped_combinations
def skipped_combinations = @skipped_combinations

Instance Attribute Details

#remarks ⇒ Object

TODO: probably doing nothing



33
34
35
# File 'lib/export/coldp.rb', line 33

# Reader for the @remarks instance variable.
# TODO: probably doing nothing
#
# @return [Object, nil] whatever was last stored in @remarks
def remarks = @remarks

Class Method Details

.basionym_id(taxon_name) ⇒ Object

Parameters:

  • taxon_name (a valid Protonym or a Combination)

    see also exclusion of OTUs/Names based on Ranks not handled



269
270
271
272
273
274
275
276
277
# File 'lib/export/coldp.rb', line 269

# Resolves the basionym identifier for a name, dispatching on its `type`.
#
# @param taxon_name [#type, #reified_id, #protonyms] a Protonym or a Combination
# @return [Object, nil] the name's own `reified_id` for a Protonym, the
#   `reified_id` of the last of its `protonyms` for a Combination, and nil
#   for any other type
def self.basionym_id(taxon_name)
  case taxon_name.type
  when 'Protonym' then taxon_name.reified_id
  when 'Combination' then taxon_name.protonyms.last.reified_id
  end
end

.download(otu, request = nil, prefer_unlabelled_otus: true) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/export/coldp.rb', line 226

# Builds the ColDP archive for +otu+ right away (via .export) and records it
# as a ::Download::Coldp.
#
# @param otu [Otu] the OTU to export; its `id` and `otu_name` are read here
# @param request [Object, nil] passed through unchanged as the download's `request`
# @param prefer_unlabelled_otus [Boolean] forwarded to .export
# @return [Object] the result of ::Download::Coldp.create!
def self.download(otu, request = nil, prefer_unlabelled_otus: true)
  file_path = ::Export::Coldp.export(
    otu.id,
    prefer_unlabelled_otus:
  )

  # The stored filename comes from .filename; the archive at file_path has a
  # random temporary name.
  ::Download::Coldp.create!(
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    source_file_path: file_path,
    request:,
    expires: 5.days.from_now
  )
end

.download_async(otu, request = nil, prefer_unlabelled_otus: true) ⇒ Object



243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/export/coldp.rb', line 243

# Creates the ::Download::Coldp record first (without a source file) and
# enqueues ColdpCreateDownloadJob for +otu+ and that record.
#
# @param otu [Otu] the OTU to export
# @param request [Object, nil] passed through unchanged as the download's `request`
# @param prefer_unlabelled_otus [Boolean] forwarded to the job
# @return [Object] the newly created download record
def self.download_async(otu, request = nil, prefer_unlabelled_otus: true)
  attributes = {
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    request:,
    expires: 5.days.from_now
  }

  ::Download::Coldp.create!(**attributes).tap do |download|
    ColdpCreateDownloadJob.perform_later(otu, download, prefer_unlabelled_otus:)
  end
end

.export(otu_id, prefer_unlabelled_otus: true) ⇒ Object

Return path to the data itself

TODO: mode: taxon_name_proxy



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/export/coldp.rb', line 114

# Builds the ColDP zip archive for the OTU with id +otu_id+ and returns the
# path to the archive.
#
# The archive contains Name.tsv, Synonym.tsv, Taxon.tsv, TypeMaterial.tsv, one
# TSV for each remaining entry of FILETYPES, References.tsv (built from the
# rows the file generators add to `ref_tsv`) and metadata.yaml.
#
# TODO: mode: taxon_name_proxy
#
# @param otu_id [Integer] id of the root OTU
# @param prefer_unlabelled_otus [Boolean] forwarded to Files::Taxon.generate
# @return [String] path of the generated zip file
def self.export(otu_id, prefer_unlabelled_otus: true)
  otus = otus(otu_id)

  # source_id: [csv_array]
  ref_tsv = {}

  otu = ::Otu.find(otu_id)

  # check for a clb_dataset_id identifier
  ns = Namespace.find_by(institution: 'ChecklistBank', name: 'clb_dataset_id')
  clb_dataset_id = otu.identifiers.where(namespace_id: ns.id).first&.identifier unless ns.nil?

  project = ::Project.find(otu.project_id)

  project_members = project_members(project.id)
  feedback_url = project[:data_curation_issue_tracker_url]

  # TODO: This will likely have to change, it is renamed on serving the file.
  zip_file_path = "/tmp/_#{SecureRandom.hex(8)}_coldp.zip"

  metadata_path = Zaru::sanitize!("/tmp/#{project.name}_#{DateTime.now}_metadata.yaml").gsub(' ', '_').downcase

  version = Settings.sandbox_mode? ? Settings.sandbox_commit_sha : TaxonWorks::VERSION

  # We lose the ability to maintain title in TW but until we can model metadata in TW,
  #   it seems desirable because there's a lot of TW vs CLB title mismatches
  if clb_dataset_id.nil?
    metadata = {
      'title' => project.name,
      'issued' => DateTime.now.strftime('%Y-%m-%d'),
      'version' => DateTime.now.strftime('%b %Y'),
      'feedbackUrl' => feedback_url
    }
  else
    metadata = Colrapi.dataset(dataset_id: clb_dataset_id)

    # remove fields maintained by ChecklistBank or TW
    exclude_fields = %w[created createdBy modified modifiedBy attempt imported lastImportAttempt lastImportState size label citation private platform]
    metadata = metadata.except(*exclude_fields)

    # put feedbackUrl before the contact email in the metadata file to encourage use of the issue tracker
    reordered_metadata = {}
    metadata.each do |key, value|
      reordered_metadata['feedbackUrl'] = feedback_url if key == 'contact'
      reordered_metadata[key] = value
    end
    metadata = reordered_metadata
  end

  # Always stamp the export date/version, overriding anything from ChecklistBank.
  metadata['issued'] = DateTime.now.strftime('%Y-%m-%d')
  metadata['version'] = DateTime.now.strftime('%b %Y')

  metadata['platform'] = {
    'name' => 'TaxonWorks',
    'alias' => 'TW',
    'version' => version
  }

  # NOTE(review): the basename argument was lost in the rendered listing;
  # metadata_path is the only candidate in scope — confirm against the source file.
  metadata_file = Tempfile.new(metadata_path)
  metadata_file.write(metadata.to_yaml)
  metadata_file.close

  Zip::File.open(zip_file_path, Zip::File::CREATE) do |zipfile|
    zipfile.get_output_stream('Name.tsv') { |f| f.write Export::Coldp::Files::Name.generate(otu, project_members, ref_tsv) }

    zipfile.get_output_stream('Synonym.tsv') { |f| f.write Export::Coldp::Files::Synonym.generate(otu, otus, project_members, ref_tsv) }

    zipfile.get_output_stream('Taxon.tsv') do |f|
      f.write Export::Coldp::Files::Taxon.generate(otu, otus, project_members, ref_tsv, prefer_unlabelled_otus)
    end

    zipfile.get_output_stream('TypeMaterial.tsv') { |f| f.write Export::Coldp::Files::TypeMaterial.generate(otu, project_members, ref_tsv) }

    # The remaining file types share one generate(otus, project_members, ref_tsv) signature.
    (FILETYPES - %w{Name Taxon References Synonym TypeMaterial}).each do |ft|
      m = "Export::Coldp::Files::#{ft}".safe_constantize
      zipfile.get_output_stream("#{ft}.tsv") { |f| f.write m.generate(otus, project_members, ref_tsv) }
    end

    # TODO: Probably not used
    # skip_name_ids = Export::Coldp::Files::Name.skipped_name_ids  ||  []

    # TODO: this doesn't really help, and adds time to the process.
    # Sort the refs by full citation string
    sorted_refs = ref_tsv.values.sort { |a, b| a[1] <=> b[1] }

    # References must be written last: ref_tsv is filled in by the generators above.
    d = ::CSV.generate(col_sep: "\t") do |tsv|
      tsv << %w[ID citation doi modified modifiedBy] # author year source details
      sorted_refs.each do |r|
        tsv << r
      end
    end

    zipfile.get_output_stream('References.tsv') { |f| f.write d }
    zipfile.add('metadata.yaml', metadata_file.path) # TODO: consider isolating Files.metadata logic
  end

  zip_file_path
ensure
  # The zip is committed when the Zip::File.open block closes, so the
  # temporary metadata file is no longer needed at this point.
  metadata_file&.unlink
end

.filename(otu) ⇒ Object



222
223
224
# File 'lib/export/coldp.rb', line 222

# Builds the filename used for a download of +otu+: project name, OTU id and
# the current timestamp, passed through Zaru, with spaces replaced by
# underscores and everything lower-cased.
#
# @param otu [Otu] its `project_id` and `id` are read
# @return [String]
def self.filename(otu)
  project_name = ::Project.find(otu.project_id).name
  raw_name = "#{project_name}_coldp_otu_id_#{otu.id}_#{DateTime.now}.zip"

  Zaru.sanitize!(raw_name).tr(' ', '_').downcase
end

.get_remarks(scope, predicate_id) ⇒ Object

TODO: We are using `,` to delimit values elsewhere (e.g. Taxon). TODO: Find by IRI, not predicate_id, so that we can unify the vocabulary. Accessed per file type.



67
68
69
70
71
72
73
74
75
# File 'lib/export/coldp.rb', line 67

# Collects, per TaxonName in +scope+, the values of the data attributes that
# use the predicate +predicate_id+, joined with '|'.
#
# NOTE(review): the query has no ORDER BY; any consumer that binary-searches
# the result by attribute_subject_id needs it sorted — confirm at the call site.
#
# @param scope [ActiveRecord::Relation] a taxon_names scope; its ids become the
#   `invalid_names` CTE
# @param predicate_id [Integer] controlled_vocabulary_term_id to filter on
# @return [Array<Hash>] rows with 'attribute_subject_id' and 'values'
def self.get_remarks(scope, predicate_id)
  subject_ids = scope.select('taxon_names.id invalid_id')
  join_sql = "JOIN invalid_names ON invalid_names.invalid_id = data_attributes.attribute_subject_id AND data_attributes.attribute_subject_type = 'TaxonName'"
  aggregate_sql = "data_attributes.attribute_subject_id, STRING_AGG(data_attributes.value, '|') AS values"

  query = DataAttribute
    .with(invalid_names: subject_ids)
    .joins(join_sql)
    .select(aggregate_sql)
    .where(data_attributes: { controlled_vocabulary_term_id: predicate_id })
    .group('data_attributes.attribute_subject_id')

  ApplicationRecord.connection.execute(query.to_sql).to_a
end

.modified(updated_at) ⇒ Object



89
90
91
92
93
94
95
# File 'lib/export/coldp.rb', line 89

# Formats a timestamp for the `modified` column.
#
# @param updated_at [#iso8601, nil]
# @return [String] the ISO 8601 representation, or '' when +updated_at+ is nil
def self.modified(updated_at)
  return '' if updated_at.nil?

  updated_at.iso8601
end

.modified_by(updated_by_id, project_members) ⇒ Object



97
98
99
# File 'lib/export/coldp.rb', line 97

# Looks up the label for the user who last updated a record.
#
# @param updated_by_id [Object] key into +project_members+
# @param project_members [Hash] lookup keyed by user id (see .project_members)
# @return [Object, nil] the stored value, or nil when the key is absent
def self.modified_by(updated_by_id, project_members) = project_members[updated_by_id]

.original_field(taxon_name) ⇒ Boolean

Doesn’t exist in ColDP

Returns:

  • (Boolean)

    `true` if there are no parens in `cached_author_year`, `false` if there are parens in `cached_author_year`



263
264
265
# File 'lib/export/coldp.rb', line 263

# Doesn't exist in ColDP.
#
# @param taxon_name [#type, #is_original_name?]
# @return [Boolean] false unless the name's type is 'Protonym'; otherwise the
#   result of `taxon_name.is_original_name?`
def self.original_field(taxon_name)
  return false unless taxon_name.type == 'Protonym'

  taxon_name.is_original_name?
end

.otus(otu_id) ⇒ Scope

Returns a full set of valid-only OTUs (= Taxa in CoLDP) that are to be sent. !! At present no OTU with a `name` is sent. In the future this may need to change.

!! No synonym TaxonName is sent if it doesn’t have an OTU

This is presently not scoping Names.csv. That’s probably OK.

Returns:

  • (Scope)

    A full set of valid-only OTUs (= Taxa in CoLDP) that are to be sent. !! At present no OTU with a `name` is sent. In the future this may need to change.

    !! No synonym TaxonName is sent if it doesn’t have an OTU

    This is presently not scoping Names.csv. That’s probably OK.



54
55
56
57
58
59
60
61
62
# File 'lib/export/coldp.rb', line 54

# Scope of the OTUs to export, rooted at the OTU with id +otu_id+.
#
# Selects OTUs whose taxon name is flagged `cached_is_valid`, has the root
# OTU's taxon name as an ancestor in taxon_name_hierarchies, and whose own
# `name` is NULL or equal to the taxon name's `cached` value.
#
# @param otu_id [Integer]
# @return [ActiveRecord::Relation] `::Otu.none` when the root OTU has no taxon name
def self.otus(otu_id)
  root_taxon_name_id = ::Otu.find(otu_id).taxon_name_id
  return ::Otu.none if root_taxon_name_id.nil?

  with_names = Otu.joins(taxon_name: [:ancestor_hierarchies])
  valid_only = with_names.where(taxon_names: { cached_is_valid: true })
  below_root = valid_only.where('taxon_name_hierarchies.ancestor_id = ?', root_taxon_name_id)

  # !! Union does not make this faster
  below_root.where('(otus.name IS NULL) OR (otus.name = taxon_names.cached)')
end

.project_members(project_id) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
# File 'lib/export/coldp.rb', line 77

# Builds a lookup from user id to the label used for `modifiedBy`.
#
# @param project_id [Integer]
# @return [Hash] user_id => the user's ORCID when it is not nil, otherwise the
#   user's name
def self.project_members(project_id)
  ProjectMember.eager_load(:user).where(project_id:).each_with_object({}) do |member, labels|
    user = member.user
    labels[member.user_id] = user.orcid.nil? ? user.name : user.orcid
  end
end

.reified_id(taxon_name_id, cached, cached_original_combination) ⇒ Object

Replicates TaxonName.reified_id. This is here because we pluck to arrays in the dump. It’s janky. TODO: try to eliminate; at minimum, test for consistency?



282
283
284
285
286
287
288
289
# File 'lib/export/coldp.rb', line 282

# Computes a reified id from plucked column values.
#
# @param taxon_name_id [Integer]
# @param cached [String, nil]
# @param cached_original_combination [String, nil]
# @return [Integer, String] +taxon_name_id+ unchanged when there is no original
#   combination or it equals +cached+; otherwise
#   "<taxon_name_id>-<MD5 hex digest of cached_original_combination>"
def self.reified_id(taxon_name_id, cached, cached_original_combination)
  # Protonym#has_alternate_original?
  has_alternate_original = cached_original_combination && cached != cached_original_combination
  return taxon_name_id unless has_alternate_original

  "#{taxon_name_id}-#{Digest::MD5.hexdigest(cached_original_combination)}"
end

.remarks=(values) ⇒ Object



40
41
42
# File 'lib/export/coldp.rb', line 40

# Stores +values+ in @remarks, which .sanitize_remarks reads.
#
# @param values [Array<Hash>, nil] rows with 'attribute_subject_id' and
#   'values' keys (the keys .sanitize_remarks looks up)
def self.remarks=(values)
  @remarks = values
end

.sanitize_remarks(id) ⇒ Object

TODO: Move to Strings::Utilities



102
103
104
105
106
107
108
# File 'lib/export/coldp.rb', line 102

# Returns the remarks stored in @remarks for the subject +id+, flattened so
# they fit in a single TSV cell.
#
# CR/LF, newline and tab characters (and the literal two-character sequences
# `\r\n`, `\n` and `\t`, which earlier versions replaced) become a space, and
# runs of spaces are collapsed to one.
#
# @remarks must be sorted ascending by 'attribute_subject_id': the lookup is a
# binary search.
#
# TODO: Move to Strings::Utilities
#
# @param id [Integer] attribute_subject_id to look up
# @return [String, nil] the cleaned value, or nil when @remarks is blank, there
#   is no row for +id+, or the row's 'values' is nil
def self.sanitize_remarks(id)
  return nil if @remarks.blank?

  # bsearch finds the first row with subject id >= id, or nil when every row
  # is smaller; either case can be a miss.
  row = @remarks.bsearch { |r| r['attribute_subject_id'] >= id }
  return nil if row.nil? || row['attribute_subject_id'] != id

  row['values']&.gsub(/\\r\\n|\\n|\\t|[\r\n\t]/, ' ')&.squeeze(' ')
end