Class: Export::Dwca::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/data.rb

Overview

!! This export does not support AssertedDistribution data at the moment. While those data are indexed, if they are in the `core_scope` they will almost certainly cause problems or be ignored. !!

Wrapper to build DWCA zipfiles for a specific project. See tasks/accessions/report/dwc_controller.rb for use.

With help from thinkingeek.com/2013/11/15/create-temporary-zip-file-send-response-rails/

Usage:

begin
 data = Export::Dwca::Data.new(core_scope: DwcOccurrence.where(project_id: sessions_current_project_id))
ensure
 data.cleanup
end

Always use the ensure/data.cleanup pattern!

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, taxonworks_extensions: []) ⇒ Data

Returns a new instance of Data.

Parameters:

  • taxonworks_extensions (Array<Symbol>) (defaults to: [])

    List of methods to perform on each CO

Raises:

  • (ArgumentError)


74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/export/dwca/data.rb', line 74

# Store the scopes and options that drive the export.
#
# @param core_scope [String, ActiveRecord::Relation] required; the records to export
# @param extension_scopes [Hash] optional scopes, read from the keys
#   :biological_associations and :media
# @param predicate_extensions [Hash] merged over the defaults
#   { collection_object_predicate_id: [], collecting_event_predicate_id: [] }
# @param taxonworks_extensions [Array<Symbol>] list of methods to perform on each CO
# @raise [ArgumentError] when core_scope is nil
def initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, taxonworks_extensions: [])
  raise ArgumentError, 'must pass a core_scope' if core_scope.nil?

  default_predicate_ids = { collection_object_predicate_id: [], collecting_event_predicate_id: [] }

  @core_scope = core_scope
  @biological_associations_extension, @media_extension =
    extension_scopes.values_at(:biological_associations, :media)

  # Caller-supplied predicate ids win over the empty defaults.
  @data_predicate_ids = default_predicate_ids.merge(predicate_extensions)
  @taxonworks_extension_methods = taxonworks_extensions
end

Instance Attribute Details

#all_dataObject

Returns Tempfile.

Returns:

  • Tempfile



67
68
69
# File 'lib/export/dwca/data.rb', line 67

# @return [Tempfile] the file #build_zip adds to the archive as 'data.tsv';
#   #meta_fields reads its first line as the tab-separated header row
def all_data
  @all_data
end

#biological_associations_extensionScope?

Returning BiologicalAssociation

Returns:

  • (Scope, nil)

    Returning BiologicalAssociation



40
41
42
# File 'lib/export/dwca/data.rb', line 40

# @return [Scope, nil] a scope returning BiologicalAssociation; taken from
#   extension_scopes[:biological_associations] in #initialize
def biological_associations_extension
  @biological_associations_extension
end

#collection_object_idsObject

TODO Breaks when AssertedDistribution is added



59
60
61
# File 'lib/export/dwca/data.rb', line 59

# Reader for @collection_object_ids.
# TODO: Breaks when AssertedDistribution is added
def collection_object_ids
  @collection_object_ids
end

#core_scopeObject

!params core_scope [String, ActiveRecord::Relation]

String is fully formed SQL


36
37
38
# File 'lib/export/dwca/data.rb', line 36

# @return [String, ActiveRecord::Relation] the scope passed to #initialize;
#   a String is fully formed SQL
def core_scope
  @core_scope
end

#dataTempfile

Returns the csv data as a tempfile.

Returns:

  • (Tempfile)

    the csv data as a tempfile



154
155
156
# File 'lib/export/dwca/data.rb', line 154

# @return [Tempfile] the csv data as a tempfile
def data
  @data
end

#data_predicate_idsObject

collection_object_predicate_id: [], collecting_event_predicate_id: []

Returns:

  • Hash



54
55
56
# File 'lib/export/dwca/data.rb', line 54

# @return [Hash] predicate ids keyed by :collection_object_predicate_id and
#   :collecting_event_predicate_id; each defaults to [] in #initialize
def data_predicate_ids
  @data_predicate_ids
end

#dwc_id_orderObject

Get order of ids that matches core records so we can align with csv zero! Like 2=>1, 3=>2, 4=>3, 5=>4

Returns:

  • Hash



71
72
73
# File 'lib/export/dwca/data.rb', line 71

# @return [Hash] order of ids that matches the core records, so other data
#   can be aligned with the csv (e.g. 2=>1, 3=>2, 4=>3, 5=>4)
def dwc_id_order
  @dwc_id_order
end

#emlTempfile

This is a stub, and only half-heartedly done. You should be using IPT for the time being. See also

https://github.com/gbif/ipt/wiki/
https://github.com/gbif/ipt/wiki/#exemplar-datasets

TODO: reference biological_resource_extension.csv

Returns:

  • (Tempfile)

    metadata about this dataset



536
537
538
# File 'lib/export/dwca/data.rb', line 536

# This is a stub, and only half-heartedly done. You should be using IPT for
# the time being.
#
# @return [Tempfile] metadata about this dataset; #build_zip adds it to the
#   archive as 'eml.xml'
def eml
  @eml
end

#filenameString (readonly)

the name of zipfile

Returns:

  • (String)


710
711
712
# File 'lib/export/dwca/data.rb', line 710

# @return [String] the name of the zipfile; #build_zip passes it to Tempfile.new
def filename
  @filename
end

#media_extensionScope?

Returns Image(?).

Returns:

  • (Scope, nil)

    Image(?)



44
45
46
# File 'lib/export/dwca/data.rb', line 44

# @return [Scope, nil] taken from extension_scopes[:media] in #initialize;
#   when truthy, #build_zip adds 'media.csv' to the archive
def media_extension
  @media_extension
end

#metaObject

Returns the value of attribute meta.



30
31
32
# File 'lib/export/dwca/data.rb', line 30

# Reader for @meta. #build_zip adds its path to the archive as 'meta.xml'
# and #cleanup closes and unlinks it.
def meta
  @meta
end

#predicate_dataObject

Returns the value of attribute predicate_data.



50
51
52
# File 'lib/export/dwca/data.rb', line 50

# Reader for @predicate_data. #cleanup closes and unlinks it when
# #predicate_options_present? is true (presumably a Tempfile; confirm).
def predicate_data
  @predicate_data
end

#taxonworks_extension_dataObject

rubocop:disable Metrics/MethodLength



211
212
213
# File 'lib/export/dwca/data.rb', line 211

# Reader for @taxonworks_extension_data. #cleanup closes and unlinks it when
# #taxonworks_options_present? is true (presumably a Tempfile; confirm).
def taxonworks_extension_data
  @taxonworks_extension_data
end

#taxonworks_extension_methodsObject

Returns the value of attribute taxonworks_extension_methods.



64
65
66
# File 'lib/export/dwca/data.rb', line 64

# @return [Array<Symbol>] the taxonworks_extensions passed to #initialize
#   (list of methods to perform on each CO)
def taxonworks_extension_methods
  @taxonworks_extension_methods
end

#totalObject

TODO update



46
47
48
# File 'lib/export/dwca/data.rb', line 46

# Reader for @total; #no_records? treats a value of 0 as "no records".
# TODO: update
def total
  @total
end

#zipfileTempfile

Returns the zipfile.

Returns:

  • (Tempfile)

    the zipfile



701
702
703
# File 'lib/export/dwca/data.rb', line 701

# @return [Tempfile] the zipfile; #package_download stores its path on the download
def zipfile
  @zipfile
end

Instance Method Details

#biological_associations_resource_relationshipObject

rubocop:enable Metrics/MethodLength



625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
# File 'lib/export/dwca/data.rb', line 625

# Build, once, the tempfile holding the biological associations
# resource-relationship extension.
#
# The tempfile is memoized. Without memoization every call created and wrote
# a brand new Tempfile, so the file added to the archive by #build_zip was a
# different object from the ones #cleanup later closed and unlinked, and the
# archived one was never removed.
#
# @return [Tempfile, nil] nil when no biological associations extension scope
#   was provided; otherwise the written, rewound tempfile
def biological_associations_resource_relationship
  return nil if biological_associations_extension.nil?
  return @biological_associations_resource_relationship if @biological_associations_resource_relationship

  # With no core records only a bare newline is written.
  content = if no_records?
              "\n"
            else
              Export::CSV::Dwc::Extension::BiologicalAssociations.csv(biological_associations_extension)
            end

  file = Tempfile.new('biological_resource_relationship.xml')
  file.write(content)
  file.flush
  file.rewind

  @biological_associations_resource_relationship = file
end

#build_zipObject



682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
# File 'lib/export/dwca/data.rb', line 682

# Assemble the archive: the data file, the optional extension files, then the
# meta and eml descriptors.
#
# @return [Tempfile] the tempfile holding the zip archive
def build_zip
  archive = Tempfile.new(filename)

  # Presumably initializes the tempfile as an empty zip so Zip::File can open
  # it below; confirm against the rubyzip documentation.
  Zip::OutputStream.open(archive) { |_stream| }

  Zip::File.open(archive.path, Zip::File::CREATE) do |zip|
    zip.add('data.tsv', all_data.path)

    if media_extension
      zip.add('media.csv', media.path)
    end

    if biological_associations_extension
      zip.add('resource_relationships.tsv', biological_associations_resource_relationship.path)
    end

    zip.add('meta.xml', meta.path)
    zip.add('eml.xml', eml.path)
  end

  archive
end

#cleanupTrue

Returns true after closing and deleting all temporary files.

Returns:

  • (True)

    close and delete all temporary files



717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
# File 'lib/export/dwca/data.rb', line 717

# Close and delete every temporary file produced by the export.
#
# Each accessor is called exactly once and the returned object is both closed
# and unlinked. Calling the accessor separately for `close` and `unlink` is
# wrong for accessors that build a new Tempfile per call (such as
# #biological_associations_resource_relationship as listed here): one file
# would be closed and a different one unlinked.
#
# @return [True]
def cleanup
  Rails.logger.debug 'dwca_export: cleanup start'

  discard = lambda do |tempfile|
    tempfile.close
    tempfile.unlink
  end

  discard.call(zipfile)
  discard.call(meta)
  discard.call(eml)
  discard.call(data)

  # Optional files are only touched when their feature was requested.
  discard.call(biological_associations_resource_relationship) if biological_associations_extension
  discard.call(predicate_data) if predicate_options_present?
  discard.call(taxonworks_extension_data) if taxonworks_options_present?

  discard.call(all_data)

  Rails.logger.debug 'dwca_export: cleanup end'

  true
end

#collecting_event_attributesObject

@return Array

1 row per CO per DA (type) on CE


380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
# File 'lib/export/dwca/data.rb', line 380

# Collect InternalAttribute values attached to the CollectingEvents of the
# exported collection objects: 1 row per CO per DA (type) on CE.
#
# When no collecting event predicate ids are configured the query is not
# restricted by predicate.
#
# NOTE(review): the predicate ids are interpolated into the SQL as-is; confirm
# that callers only ever supply integer ids.
#
# @return [Array<Array>] rows of [collection object id, predicate, value]
def collecting_event_attributes
  scoped_objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql
  predicate_ids = collecting_event_predicate_ids

  sql = "WITH relevant_collection_objects AS (
      #{scoped_objects_sql}
  )

  SELECT
      relevant_collection_objects.id AS co_id,
      CONCAT('TW:DataAttribute:CollectingEvent:', cvt.name) AS predicate,
      da.value
  FROM
      data_attributes da
      JOIN collecting_events ce ON ce.id = da.attribute_subject_id
           AND da.attribute_subject_type = 'CollectingEvent'
           AND da.type = 'InternalAttribute'
      LEFT JOIN relevant_collection_objects ON ce.id = relevant_collection_objects.collecting_event_id
      JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
          AND cvt.type = 'Predicate'
  WHERE relevant_collection_objects.id IS NOT null"

  sql += " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" if predicate_ids.any?

  DataAttribute.connection.execute(sql).map { |row| row.values_at('co_id', 'predicate', 'value') }
end

#collecting_event_attributes_queryObject

Returns Relation the unique attributes derived from CollectingEvents.

Returns:

  • Relation the unique attributes derived from CollectingEvents



369
370
371
372
373
374
375
376
# File 'lib/export/dwca/data.rb', line 369

# Relation over the InternalAttributes attached to the CollectingEvents
# touched by this export, restricted to the configured collecting event
# predicate ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation selecting from a
#   CTE-backed subquery aliased as data_attributes
def collecting_event_attributes_query
  events_sql = collecting_events.to_sql
  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collecting_events as tce1 on data_attributes.attribute_subject_id = tce1.id AND data_attributes.attribute_subject_type = 'CollectingEvent'")
    .where(controlled_vocabulary_term_id: collecting_event_predicate_ids)
    .to_sql

  cte = "WITH touched_collecting_events AS (#{events_sql}) #{attributes_sql}"

  ::InternalAttribute.from("(#{cte}) as data_attributes")
end

#collecting_event_predicate_idsObject



105
106
107
# File 'lib/export/dwca/data.rb', line 105

# @return [Array] the value stored under :collecting_event_predicate_id in
#   @data_predicate_ids ([] unless overridden via predicate_extensions in #initialize)
def collecting_event_predicate_ids
  @data_predicate_ids[:collecting_event_predicate_id]
end

#collecting_eventsObject

rubocop:enable Metrics/MethodLength



327
328
329
330
331
332
333
334
# File 'lib/export/dwca/data.rb', line 327

# Distinct CollectingEvents referenced by the exported collection objects.
#
# @return [ActiveRecord::Relation] CollectingEvent relation selecting from a
#   CTE-backed subquery aliased as collecting_events
def collecting_events
  scoped_objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql
  events_sql = ::CollectingEvent
    .joins('JOIN co_scoped as co_scoped1 on co_scoped1.collecting_event_id = collecting_events.id')
    .distinct
    .to_sql

  cte = "WITH co_scoped AS (#{scoped_objects_sql}) #{events_sql}"

  ::CollectingEvent.from("(#{cte}) as collecting_events")
end

#collection_object_attributesObject



347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# File 'lib/export/dwca/data.rb', line 347

# Collect InternalAttribute values attached directly to the exported
# collection objects.
#
# When no collection object predicate ids are configured the query is not
# restricted by predicate.
#
# NOTE(review): the predicate ids are interpolated into the SQL as-is; confirm
# that callers only ever supply integer ids.
#
# @return [Array<Array>] rows of [collection object id, predicate, value]
def collection_object_attributes
  scoped_objects_sql = collection_objects.unscope(:order).select(:id).to_sql
  predicate_ids = collection_object_predicate_ids

  sql = "WITH relevant_collection_objects AS (
      #{scoped_objects_sql}
  )
  SELECT da.id, da.attribute_subject_id,
         CONCAT('TW:DataAttribute:CollectionObject:', cvt.name) AS predicate,
         da.value,
         da.controlled_vocabulary_term_id
  FROM data_attributes da
  JOIN relevant_collection_objects rco ON da.attribute_subject_id = rco.id
                                       AND da.attribute_subject_type = 'CollectionObject'
  JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
                                       AND cvt.type = 'Predicate'
  WHERE da.type = 'InternalAttribute'"

  sql += " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" if predicate_ids.any?

  DataAttribute.connection.execute(sql).map { |row| row.values_at('attribute_subject_id', 'predicate', 'value') }
end

#collection_object_attributes_queryObject



336
337
338
339
340
341
342
343
344
345
# File 'lib/export/dwca/data.rb', line 336

# Relation over the InternalAttributes attached to the exported collection
# objects, joined to their predicate and restricted to the configured
# collection object predicate ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation selecting from a
#   CTE-backed subquery aliased as data_attributes
def collection_object_attributes_query
  scoped_objects_sql = collection_objects.unscope(:order).select(:id).to_sql
  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collection_objects as tco1 on data_attributes.attribute_subject_id = tco1.id AND data_attributes.attribute_subject_type = 'CollectionObject'")
    .to_sql

  cte = "WITH touched_collection_objects AS (#{scoped_objects_sql}) #{attributes_sql}"

  ::InternalAttribute
    .joins(:predicate)
    .where(controlled_vocabulary_term_id: collection_object_predicate_ids)
    .from("(#{cte}) as data_attributes")
end

#collection_object_predicate_idsObject



101
102
103
# File 'lib/export/dwca/data.rb', line 101

# @return [Array] the value stored under :collection_object_predicate_id in
#   @data_predicate_ids ([] unless overridden via predicate_extensions in #initialize)
def collection_object_predicate_ids
  @data_predicate_ids[:collection_object_predicate_id]
end

#collection_objectsObject



404
405
406
407
408
409
410
411
# File 'lib/export/dwca/data.rb', line 404

# CollectionObjects referenced by the core scope: rows whose
# dwc_occurrence_object_type is 'CollectionObject'.
#
# @return [ActiveRecord::Relation] CollectionObject relation (id,
#   collecting_event_id, type) selecting from a CTE-backed subquery aliased as
#   collection_objects
def collection_objects
  occurrences_sql = core_scope
    .unscope(:order)
    .select('dwc_occurrences.dwc_occurrence_object_id, dwc_occurrences.dwc_occurrence_object_type')
    .to_sql
  objects_sql = ::CollectionObject
    .joins("JOIN dwc_scoped as dwc_scoped1 on dwc_scoped1.dwc_occurrence_object_id = collection_objects.id and dwc_scoped1.dwc_occurrence_object_type = 'CollectionObject'")
    .select(:id, :collecting_event_id, :type)
    .to_sql

  cte = "WITH dwc_scoped AS (#{occurrences_sql}) #{objects_sql}"

  ::CollectionObject.from("(#{cte}) as collection_objects")
end

#csvCSV

Returns the data as a CSV object.

Returns:

  • (CSV)

    the data as a CSV object



134
135
136
137
138
139
140
141
142
143
144
# File 'lib/export/dwca/data.rb', line 134

# Render the core scope through ::Export::CSV.generate_csv with the DwC
# column settings.
#
# @return [CSV] the data as a CSV object
def csv
  scope = core_scope.computed_columns
  # TODO: check to see if we need this
  excluded = ::DwcOccurrence.excluded_columns
  # TODO: add other maps here
  ordered = ::CollectionObject::DWC_OCCURRENCE_MAP.keys + ::CollectionObject::EXTENSION_FIELDS

  ::Export::CSV.generate_csv(
    scope,
    exclude_columns: excluded,
    column_order: ordered,
    trim_columns: true, # going to have to be optional
    trim_rows: false,
    header_converters: [:dwc_headers]
  )
end

#extension_computed_fields_data(methods) ⇒ Object

TODO: return, or optimize to this when ::CollectionObject::EXTENSION_COMPUTED_FIELDS.size > 1

def extension_computed_fields_data(methods)

d = []
collection_objects.find_each do |object|
  methods.each_pair { |method, name| d  << [object.id, name, object.send(method)] }
end
d

end

!! This will have to be reverted to above when > 1 EXTENSION field is present



194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/export/dwca/data.rb', line 194

# Rows for the computed "TW:Internal:otu_name" extension field.
#
# `methods` is only checked for emptiness; the single supported field is
# hard-coded in the query below.
#
# @param methods [#empty?] the requested extension methods
# @return [Array<Array>] rows of [collection object id, "TW:Internal:otu_name", otu name or nil]
def extension_computed_fields_data(methods)
  return [] if methods.empty?

  field_name = "TW:Internal:otu_name".freeze

  # n = "COALESCE( otus.name, TRIM(CONCAT(cached, ' ', cached_author_year))) as otu_name"

  collection_objects
    .left_joins(otu: [:taxon_name])
    .select("collection_objects.id, otus.name as otu_name")
    .where(taxon_determinations: {position: '1'})
    .find_each(batch_size: 10000)
    .map { |record| [record.id, field_name, record['otu_name'].presence] }
end

#meta_fieldsArray

id, and non-standard DwC columns are handled elsewhere

Returns:

  • (Array)

    use the temporarily written, and refined, CSV file to read off the existing headers so we can use them in writing meta.xml



647
648
649
650
651
652
# File 'lib/export/dwca/data.rb', line 647

# Read the header row of #all_data and return its column names without the
# first column (id, and non-standard DwC columns are handled elsewhere).
#
# @return [Array<String>] header names; [] when there are no records or the
#   file has no first line
def meta_fields
  return [] if no_records?

  header_line = File.open(all_data, &:gets)
  return [] if header_line.nil?

  header_line.strip.split("\t").drop(1)
end

#no_records?Boolean

Returns true if provided core_scope returns no records.

Returns:

  • (Boolean)

    true if provided core_scope returns no records



148
149
150
# File 'lib/export/dwca/data.rb', line 148

# @return [Boolean] true when #total equals 0, i.e. the provided core_scope
#   returns no records
def no_records?
  total == 0
end

#package_download(download) ⇒ Download

Returns a download instance.

Parameters:

Returns:



755
756
757
758
# File 'lib/export/dwca/data.rb', line 755

# Point the given download at the generated zipfile.
#
# @param download [Download] record updated (via update!) with the zipfile path
# @return [Download] the same download instance
def package_download(download)
  download.tap { |record| record.update!(source_file_path: zipfile.path) }
end

#predicate_options_present?Boolean

Returns:

  • (Boolean)


120
121
122
# File 'lib/export/dwca/data.rb', line 120

# @return [Boolean] true when either the collection object or the collecting
#   event predicate id list is present
def predicate_options_present?
  data_predicate_ids
    .values_at(:collection_object_predicate_id, :collecting_event_predicate_id)
    .any?(&:present?)
end

#taxonworks_options_present?Boolean

Returns:

  • (Boolean)


124
125
126
# File 'lib/export/dwca/data.rb', line 124

# @return [Boolean] whether #taxonworks_extension_methods is present, i.e. any
#   TaxonWorks extension methods were supplied
def taxonworks_options_present?
  taxonworks_extension_methods.present?
end

#used_collecting_event_predicatesObject



419
420
421
422
423
# File 'lib/export/dwca/data.rb', line 419

# Distinct predicate names used on the exported CollectingEvents.
#
# @return [Array<String>] names in the format
#   `TW:DataAttribute:CollectingEvent:<name>`
def used_collecting_event_predicates
  name_column = "CONCAT('TW:DataAttribute:CollectingEvent:', controlled_vocabulary_terms.name) predicate_name"

  collecting_event_attributes_query
    .joins(:predicate)
    .select(name_column)
    .distinct
    .map { |row| row['predicate_name'] }
end

#used_collection_object_predicatesObject



413
414
415
416
417
# File 'lib/export/dwca/data.rb', line 413

# Distinct predicate names used on the exported CollectionObjects.
#
# @return [Array<String>] names in the format
#   `TW:DataAttribute:CollectionObject:<name>`
def used_collection_object_predicates
  name_column = "CONCAT('TW:DataAttribute:CollectionObject:', controlled_vocabulary_terms.name) predicate_name"

  collection_object_attributes_query
    .select(name_column)
    .distinct
    .map { |row| row['predicate_name'] }
end

#used_predicatesArray

Returns an Array of distinct Predicate names in the format

`TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`.

Returns:

  • (Array)

    of distinct Predicate names in the format

    `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
    


428
429
430
# File 'lib/export/dwca/data.rb', line 428

# @return [Array] the collection object predicate names followed by the
#   collecting event predicate names, each in the format
#   `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
def used_predicates
  used_collection_object_predicates + used_collecting_event_predicates
end