Module: Export::Coldp

Defined in:
lib/export/coldp.rb

Overview

Exports to the Catalog of Life in the new “coldp” format. api.col.plus/datapackage

  • write tests to check for coverage (missing methods)

  • Update all file formats to use tabs

  • Pending handling of both BibTeX and Verbatim

Constant Summary collapse

FILETYPES =
%w{Distribution Name NameRelation SpeciesInteraction Synonym TaxonConceptRelation TypeMaterial VernacularName Taxon References}.freeze

Class Method Summary collapse

Class Method Details

.basionym_id(taxon_name) ⇒ Object

Parameters:

  • taxon_name (a valid Protonym or a Combination)

    see also exclusion of OTUs/Names based on Ranks not handled



211
212
213
214
215
216
217
218
219
# File 'lib/export/coldp.rb', line 211

# Resolves the identifier to report as the basionym for a name.
#
# @param taxon_name [#type, #reified_id, #protonyms] a Protonym or a Combination
# @return [Object, nil] the name's own `reified_id` when `type` is 'Protonym',
#   the `reified_id` of the last entry of `protonyms` when `type` is
#   'Combination', and `nil` for every other `type`
def self.basionym_id(taxon_name)
  case taxon_name.type
  when 'Protonym' then taxon_name.reified_id
  when 'Combination' then taxon_name.protonyms.last.reified_id
  end
end

.download(otu, request = nil, prefer_unlabelled_otus: true) ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/export/coldp.rb', line 169

# Synchronously builds the ColDP archive for an OTU and records it as a
# download.
#
# The archive is generated first (via `::Export::Coldp.export`), then a
# `::Download::Coldp` record pointing at the generated file is created.
#
# @param otu [Otu] the OTU whose `id` is exported; its `otu_name` is used in
#   the download's display name
# @param request [Object, nil] passed through unchanged as the download's `request`
# @param prefer_unlabelled_otus [Boolean] forwarded to `export`
# @return [Object] whatever `::Download::Coldp.create!` returns
def self.download(otu, request = nil, prefer_unlabelled_otus: true)
  file_path = ::Export::Coldp.export(otu.id, prefer_unlabelled_otus:)

  ::Download::Coldp.create!(
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    source_file_path: file_path,
    request:,
    expires: 5.days.from_now
  )
end

.download_async(otu, request = nil, prefer_unlabelled_otus: true) ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/export/coldp.rb', line 186

# Creates the download record up front and defers building the archive to a
# background job.
#
# Unlike `download`, no `source_file_path` is set here; the record is handed
# to `ColdpCreateDownloadJob` together with the OTU.
#
# @param otu [Otu] the OTU to export; its `otu_name` is used in the display name
# @param request [Object, nil] passed through unchanged as the download's `request`
# @param prefer_unlabelled_otus [Boolean] forwarded to the job
# @return [Object] the record returned by `::Download::Coldp.create!`
def self.download_async(otu, request = nil, prefer_unlabelled_otus: true)
  attributes = {
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    request:,
    expires: 5.days.from_now
  }

  ::Download::Coldp.create!(attributes).tap do |record|
    ColdpCreateDownloadJob.perform_later(otu, record, prefer_unlabelled_otus:)
  end
end

.export(otu_id, prefer_unlabelled_otus: true) ⇒ Object

Return path to the data itself

TODO: mode: taxon_name_proxy



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/export/coldp.rb', line 65

# Builds a ColDP zip archive for the OTU identified by `otu_id` and returns
# the path of the zip file on disk.
#
# The archive holds one TSV per entry in FILETYPES plus a `metadata.yaml`.
# Metadata comes from one of two places:
# * if the OTU carries an identifier in the ChecklistBank `clb_dataset_id`
#   namespace, the dataset metadata is fetched with `Colrapi.dataset`, the
#   fields listed in `exclude_fields` are dropped, and `feedbackUrl` is
#   inserted immediately before the `contact` key (when that key is present);
# * otherwise a minimal hash is built from the project.
# In both cases `issued`, `version` and `platform` are (re)assigned afterwards.
#
# NOTE(review): in the copy of this method that was reviewed, every local
# whose name contained "metadata" had been stripped, leaving invalid Ruby.
# The names `metadata`, `metadata_path`, `reordered_metadata` and
# `metadata_file` below are reconstructions; they are local to this method.
#
# TODO: mode: taxon_name_proxy
#
# @param otu_id [Integer] id of the root OTU
# @param prefer_unlabelled_otus [Boolean] forwarded to
#   `Export::Coldp::Files::Taxon.generate`
# @return [String] path to the generated zip file
def self.export(otu_id, prefer_unlabelled_otus: true)
  otus = otus(otu_id)

  # source_id: [csv_array]
  ref_tsv = {}

  otu = ::Otu.find(otu_id)

  # check for a clb_dataset_id identifier
  ns = Namespace.find_by(institution: 'ChecklistBank', name: 'clb_dataset_id')
  clb_dataset_id = otu.identifiers.where(namespace_id: ns.id).first&.identifier unless ns.nil?

  project = ::Project.find(otu.project_id)
  project_members = project_members(project.id)
  feedback_url = project[:data_curation_issue_tracker_url]

  # TODO: This will likely have to change, it is renamed on serving the file.
  zip_file_path = "/tmp/_#{SecureRandom.hex(8)}_coldp.zip"

  metadata_path = Zaru::sanitize!("/tmp/#{project.name}_#{DateTime.now}_metadata.yaml").gsub(' ', '_').downcase
  version = Settings.sandbox_mode? ? Settings.sandbox_commit_sha : TaxonWorks::VERSION

  # We lose the ability to maintain title in TW but until we can model metadata in TW,
  #   it seems desirable because there's a lot of TW vs CLB title mismatches
  if clb_dataset_id.nil?
    # 'issued' and 'version' are reassigned below; they are listed here so the
    # key order of the emitted YAML stays title, issued, version, feedbackUrl.
    metadata = {
      'title' => project.name,
      'issued' => DateTime.now.strftime('%Y-%m-%d'),
      'version' => DateTime.now.strftime('%b %Y'),
      'feedbackUrl' => feedback_url
    }
  else
    metadata = Colrapi.dataset(dataset_id: clb_dataset_id)

    # remove fields maintained by ChecklistBank or TW
    exclude_fields = %w[created createdBy modified modifiedBy attempt imported lastImportAttempt lastImportState size label citation private platform]
    metadata = metadata.except(*exclude_fields)

    # put feedbackUrl before the contact email in the metadata file to encourage use of the issue tracker
    reordered_metadata = {}
    metadata.each do |key, value|
      reordered_metadata['feedbackUrl'] = feedback_url if key == 'contact'
      reordered_metadata[key] = value
    end
    metadata = reordered_metadata
  end

  metadata['issued'] = DateTime.now.strftime('%Y-%m-%d')
  metadata['version'] = DateTime.now.strftime('%b %Y')

  metadata['platform'] = {
    'name' => 'TaxonWorks',
    'alias' => 'TW',
    'version' => version
  }

  # NOTE(review): the argument to Tempfile.new was lost along with the other
  # "metadata" identifiers; `metadata_path` is the only candidate in scope and
  # is otherwise unused -- confirm against the repository.
  metadata_file = Tempfile.new(metadata_path)
  metadata_file.write(metadata.to_yaml)
  metadata_file.close

  Zip::File.open(zip_file_path, Zip::File::CREATE) do |zipfile|

    (FILETYPES - %w{Name Taxon References Synonym}).each do |ft| # TODO: double check Synonym belongs there.
      m = "Export::Coldp::Files::#{ft}".safe_constantize
      zipfile.get_output_stream("#{ft}.tsv") { |f| f.write m.generate(otus, project_members, ref_tsv) }
    end

    zipfile.get_output_stream('Name.tsv') { |f| f.write Export::Coldp::Files::Name.generate(otu, project_members, ref_tsv) }

    # Synonym and Taxon are written after Name because both are passed the
    # name ids that Name reports via `skipped_name_ids`.
    skip_name_ids = Export::Coldp::Files::Name.skipped_name_ids
    zipfile.get_output_stream('Synonym.tsv') { |f| f.write Export::Coldp::Files::Synonym.generate(otus, project_members, ref_tsv, skip_name_ids) }

    zipfile.get_output_stream('Taxon.tsv') do |f|
      f.write Export::Coldp::Files::Taxon.generate(otus, project_members, otu_id, ref_tsv, prefer_unlabelled_otus, skip_name_ids)
    end

    # TODO: this doesn't really help, and adds time to the process.
    # Sort the refs by full citation string
    sorted_refs = ref_tsv.values.sort { |a, b| a[1] <=> b[1] }

    d = ::CSV.generate(col_sep: "\t") do |tsv|
      tsv << %w{ID citation doi modified modifiedBy} # author year source details
      sorted_refs.each do |r|
        tsv << r
      end
    end

    # References go last: `ref_tsv` is handed to every generator above and is
    # only read here.
    zipfile.get_output_stream('References.tsv') { |f| f.write d }
    zipfile.add('metadata.yaml', metadata_file.path)
  end

  zip_file_path
end

.filename(otu) ⇒ Object



165
166
167
# File 'lib/export/coldp.rb', line 165

# Builds the file name used for a ColDP download of the given OTU.
#
# The name is made of the owning project's name, the OTU id and the current
# `DateTime.now`; it is passed through `Zaru::sanitize!`, then spaces are
# replaced with underscores and the result is lower-cased.
#
# @param otu [#project_id, #id]
# @return [String] a name ending in `.zip`
def self.filename(otu)
  project_name = ::Project.find(otu.project_id).name
  raw_name = "#{project_name}_coldp_otu_id_#{otu.id}_#{DateTime.now}.zip"

  Zaru.sanitize!(raw_name).tr(' ', '_').downcase
end

.modified(updated_at) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/export/coldp.rb', line 44

# Formats a timestamp for the `modified` column.
#
# @param updated_at [#iso8601, nil]
# @return [String] an empty string when `updated_at` is nil, otherwise the
#   result of `updated_at.iso8601`
def self.modified(updated_at)
  return '' if updated_at.nil?

  updated_at.iso8601
end

.modified_by(updated_by_id, project_members) ⇒ Object



52
53
54
# File 'lib/export/coldp.rb', line 52

# Looks up the label to report in the `modifiedBy` column.
#
# @param updated_by_id [Object] key into `project_members`
# @param project_members [#[]] lookup keyed by user id (the shape built by
#   `project_members` in this module: user id => ORCID or name)
# @return [Object, nil] the entry stored under `updated_by_id`
def self.modified_by(updated_by_id, project_members) = project_members[updated_by_id]

.original_field(taxon_name) ⇒ Boolean

Returns `true` if no parens in `cached_author_year`, `false` if parens in `cached_author_year`.

Returns:

  • (Boolean)

    `true` if no parens in `cached_author_year`, `false` if parens in `cached_author_year`



205
206
207
# File 'lib/export/coldp.rb', line 205

# Tells whether a name should be flagged as "original".
#
# @param taxon_name [#type, #is_original_name?]
# @return [Boolean] `false` when `type` is not 'Protonym'; otherwise whatever
#   `taxon_name.is_original_name?` returns
def self.original_field(taxon_name)
  return false unless taxon_name.type == 'Protonym'

  taxon_name.is_original_name?
end

.otus(otu_id) ⇒ Scope

Returns a full set of valid-only OTUs (= Taxa in CoLDP) that are to be sent. !! At present no OTU with a `name` is sent. In the future this may need to change.

!! No synonym TaxonName is sent if it doesn’t have an OTU.

Returns:

  • (Scope)

    A full set of valid-only OTUs (= Taxa in CoLDP) that are to be sent. !! At present no OTU with a `name` is sent. In the future this may need to change.

    !! No synonym TaxonName is sent if it doesn’t have an OTU



22
23
24
25
26
27
28
29
30
# File 'lib/export/coldp.rb', line 22

# Selects the OTUs to export beneath a root OTU.
#
# The scope joins OTUs to their taxon names and those names' ancestor
# hierarchies, then keeps rows where the taxon name has
# `cached_is_valid = true`, the hierarchy's `ancestor_id` equals the root
# OTU's `taxon_name_id`, and the OTU's `name` is NULL or equal to the taxon
# name's `cached` value.
#
# @param otu_id [Integer] id of the root OTU
# @return [Scope] `::Otu.none` when the root OTU has no `taxon_name_id`
def self.otus(otu_id)
  root_name_id = ::Otu.find(otu_id).taxon_name_id
  return ::Otu.none if root_name_id.nil?

  scope = Otu.joins(taxon_name: [:ancestor_hierarchies])
  scope = scope.where(taxon_names: { cached_is_valid: true })
  scope = scope.where('taxon_name_hierarchies.ancestor_id = ?', root_name_id)

  # !! Union does not make this faster
  scope.where('(otus.name IS NULL) OR (otus.name = taxon_names.cached)')
end

.project_members(project_id) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
# File 'lib/export/coldp.rb', line 32

# Builds the user-id lookup used to fill `modifiedBy` columns.
#
# @param project_id [Integer] project whose `ProjectMember` rows are read
# @return [Hash] `user_id` => the member's user's `orcid`, or the user's
#   `name` when `orcid` is nil
def self.project_members(project_id)
  ProjectMember.where(project_id:).each_with_object({}) do |member, lookup|
    user = member.user
    lookup[member.user_id] = user.orcid.nil? ? user.name : user.orcid
  end
end

.reified_id(taxon_name_id, cached, cached_original_combination) ⇒ Object

Replicates TaxonName.reified_id. This is here because we pluck to arrays in the dump. It’s janky. TODO: try to eliminate, at minimum test for consistency?



224
225
226
227
228
229
230
231
# File 'lib/export/coldp.rb', line 224

# Computes a reified id from plucked column values rather than from a model
# instance.
#
# @param taxon_name_id [Object] the name's id
# @param cached [String, nil]
# @param cached_original_combination [String, nil]
# @return [Object] `taxon_name_id` unchanged when there is no original
#   combination or it equals `cached`; otherwise the String
#   "<taxon_name_id>-<MD5 hex digest of cached_original_combination>"
def self.reified_id(taxon_name_id, cached, cached_original_combination)
  # Protonym#has_alternate_original?
  return taxon_name_id if !cached_original_combination || cached == cached_original_combination

  "#{taxon_name_id}-#{Digest::MD5.hexdigest(cached_original_combination)}"
end

.sanitize_remarks(remarks) ⇒ Object

TODO: Move to Strings::Utilities



57
58
59
# File 'lib/export/coldp.rb', line 57

# Flattens free-text remarks onto a single line.
#
# Carriage returns, newlines and tabs are each replaced with a space, then
# runs of spaces are collapsed to one. The patterns match the actual control
# characters: the earlier single-quoted '\n' / '\t' / '\r\n' arguments only
# matched a literal backslash followed by a letter, so real line breaks and
# tabs were left in place.
#
# TODO: Move to Strings::Utilities
#
# @param remarks [String, nil]
# @return [String, nil] the flattened text, or nil when `remarks` is nil
def self.sanitize_remarks(remarks)
  return nil if remarks.nil?

  remarks.gsub(/[\r\n\t]/, ' ').squeeze(' ')
end