Class: Export::Dwca::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/data.rb

Overview

!! This export does not support AssertedDistribution data at the moment. While those data are indexed, if they are in the `core_scope` they will almost certainly cause problems or be ignored. !!

Wrapper to build DWCA zipfiles for a specific project. See tasks/accessions/report/dwc_controller.rb for use.

With help from thinkingeek.com/2013/11/15/create-temporary-zip-file-send-response-rails/

Usage:

begin
 data = Dwca::Data.new(core_scope: DwcOccurrence.where(project_id: sessions_current_project_id))
ensure
 data.cleanup
end

Always use the ensure/data.cleanup pattern!

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: []) ⇒ Data

Returns a new instance of Data.

Parameters:

  • taxonworks_extensions (Array<Symbol>) (defaults to: [])

    List of methods to perform on each CO

Raises:

  • (ArgumentError)


84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/export/dwca/data.rb', line 84

# Stores the export configuration; nothing is queried or written here.
#
# @param core_scope [Object] required; documented elsewhere on this page as a String or ActiveRecord::Relation
# @param extension_scopes [Hash] read for the :biological_associations and :media keys
# @param predicate_extensions [Hash] merged over the two empty default predicate id lists
# @param eml_data [Hash] stored as-is for later EML construction
# @param taxonworks_extensions [Array<Symbol>] stored as the list of extension methods
# @raise [ArgumentError] when core_scope is nil
def initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: [])
  raise ArgumentError, 'must pass a core_scope' if core_scope.nil?

  # Both predicate lists default to empty; caller-supplied keys win.
  default_predicate_ids = { collection_object_predicate_id: [], collecting_event_predicate_id: [] }

  @core_scope = core_scope
  @eml_data = eml_data
  @data_predicate_ids = default_predicate_ids.merge(predicate_extensions)

  # Hash with keys core_params, collection_objects_query
  @biological_associations_extension = extension_scopes[:biological_associations]
  # Hash with keys collection_objects, field_occurrences
  @media_extension = extension_scopes[:media]

  @taxonworks_extension_methods = taxonworks_extensions
end

Instance Attribute Details

#all_dataObject

Returns Tempfile.

Returns:

  • Tempfile



77
78
79
# File 'lib/export/dwca/data.rb', line 77

# Reader for @all_data (documented on this page as a Tempfile).
# build_zip adds its path to the archive as 'data.tsv' and meta_fields reads its first line.
def all_data
  @all_data
end

#biological_associations_extensionHash

Returns a Hash with the keys core_params and collection_objects_query.

Returns:

  • (Hash)

    with the keys core_params and collection_objects_query



51
52
53
# File 'lib/export/dwca/data.rb', line 51

# Reader for @biological_associations_extension, which initialize takes from
# extension_scopes[:biological_associations]; nil when that key was not supplied.
def biological_associations_extension
  @biological_associations_extension
end

#collection_object_idsObject

TODO Breaks when AssertedDistribution is added



69
70
71
# File 'lib/export/dwca/data.rb', line 69

# Reader for @collection_object_ids. Nothing in this snippet assigns it, so the
# value is nil unless set elsewhere.
# TODO: breaks when AssertedDistribution is added.
def collection_object_ids
  @collection_object_ids
end

#core_scopeObject

!params core_scope [String, ActiveRecord::Relation]

String is fully formed SQL


48
49
50
# File 'lib/export/dwca/data.rb', line 48

# Reader for @core_scope, the required scope stored by initialize.
# NOTE(review): documented as String or ActiveRecord::Relation, but collection_objects
# and csv call relation methods on it - confirm String input is still supported.
def core_scope
  @core_scope
end

#dataTempfile

Returns the csv data as a tempfile.

Returns:

  • (Tempfile)

    the csv data as a tempfile



195
196
197
# File 'lib/export/dwca/data.rb', line 195

# Reader for @data (documented on this page as the CSV data Tempfile).
# cleanup closes and unlinks whatever this returns.
def data
  @data
end

#data_predicate_idsObject

collection_object_predicate_id: [], collecting_event_predicate_id: []

Returns:

  • Hash



64
65
66
# File 'lib/export/dwca/data.rb', line 64

# Reader for @data_predicate_ids. initialize builds it by merging the caller's
# predicate_extensions over { collection_object_predicate_id: [], collecting_event_predicate_id: [] }.
def data_predicate_ids
  @data_predicate_ids
end

#dwc_id_orderObject

Get order of ids that matches core records so we can align with csv zero! Like 2=>1, 3=>2, 4=>3, 5=>4

Returns:

  • Hash



81
82
83
# File 'lib/export/dwca/data.rb', line 81

# Reader for @dwc_id_order (documented on this page as a Hash mapping ids to their
# row position in the core file). Nothing in this snippet assigns it.
def dwc_id_order
  @dwc_id_order
end

#emlTempfile

This is a stub, and only half-heartedly done. You should be using IPT for the time being. See also

https://github.com/gbif/ipt/wiki/
https://github.com/gbif/ipt/wiki/#exemplar-datasets

TODO: reference biological_resource_extension.csv

Returns:

  • (Tempfile)

    metadata about this dataset



589
590
591
# File 'lib/export/dwca/data.rb', line 589

# Reader for @eml (documented on this page as the dataset-metadata Tempfile).
# build_zip adds its path as 'eml.xml'; cleanup closes and unlinks it.
def eml
  @eml
end

#eml_dataHash

for use in construction of the eml file.

Returns:

  • (Hash)

    containing dataset and additional_metadata, as xml strings,



38
39
40
# File 'lib/export/dwca/data.rb', line 38

# Reader for @eml_data, the Hash passed to initialize as eml_data (defaults to {}).
def eml_data
  @eml_data
end

#filenameString (readonly)

the name of zipfile

Returns:

  • (String)


775
776
777
# File 'lib/export/dwca/data.rb', line 775

# Reader for @filename, the zipfile name; build_zip passes it to Tempfile.new as the basename.
def filename
  @filename
end

#media_extensionHash

Returns of collection_objects: query_string, field_occurrences: query_string.

Returns:

  • (Hash)

    of collection_objects: query_string, field_occurrences: query_string



54
55
56
# File 'lib/export/dwca/data.rb', line 54

# Reader for @media_extension, which initialize takes from extension_scopes[:media].
# media_tmp reads its :collection_objects and :field_occurrences keys.
def media_extension
  @media_extension
end

#metaObject

Returns the value of attribute meta.



42
43
44
# File 'lib/export/dwca/data.rb', line 42

# Reader for @meta. build_zip adds its path to the archive as 'meta.xml';
# cleanup closes and unlinks it.
def meta
  @meta
end

#predicate_dataObject

Returns the value of attribute predicate_data.



60
61
62
# File 'lib/export/dwca/data.rb', line 60

# Reader for @predicate_data. cleanup closes and unlinks it only when
# predicate_options_present? is true.
def predicate_data
  @predicate_data
end

#taxonworks_extension_dataObject

rubocop:disable Metrics/MethodLength



252
253
254
# File 'lib/export/dwca/data.rb', line 252

# Reader for @taxonworks_extension_data. cleanup closes and unlinks it only when
# taxonworks_options_present? is true.
def taxonworks_extension_data
  @taxonworks_extension_data
end

#taxonworks_extension_methodsObject

Returns the value of attribute taxonworks_extension_methods.



74
75
76
# File 'lib/export/dwca/data.rb', line 74

# Reader for @taxonworks_extension_methods, the taxonworks_extensions list given
# to initialize (defaults to []).
def taxonworks_extension_methods
  @taxonworks_extension_methods
end

#totalObject

TODO update



56
57
58
# File 'lib/export/dwca/data.rb', line 56

# Reader for @total. no_records? compares it with 0, so it is presumably the
# number of core records - TODO confirm where it is assigned.
def total
  @total
end

#zipfileTempfile

Returns the zipfile.

Returns:

  • (Tempfile)

    the zipfile



766
767
768
# File 'lib/export/dwca/data.rb', line 766

# Reader for @zipfile (documented on this page as the zip Tempfile).
# package_download reads its path; cleanup closes and unlinks it.
def zipfile
  @zipfile
end

Instance Method Details

#biological_association_relations_to_coreObject

rubocop:enable Metrics/MethodLength



608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
# File 'lib/export/dwca/data.rb', line 608

# Looks up which biological associations touch the core records, split by the
# role the collection object plays in the association.
#
# @return [Hash{Symbol => Set}] { subject: Set of ids, object: Set of ids }
def biological_association_relations_to_core
  # The same occurrence query is used for both roles.
  occurrence_query = { dwc_occurrence_query: @biological_associations_extension[:core_params] }

  %i[subject object].to_h do |role|
    associations = ::Queries::BiologicalAssociation::Filter.new(
      collection_object_query: occurrence_query,
      collection_object_as_subject_or_as_object: role
    ).all

    [role, Set.new(associations.pluck(:id))]
  end
end

#biological_associations_resource_relationship_tmpObject



631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
# File 'lib/export/dwca/data.rb', line 631

# Builds, once, the tempfile holding the resource-relationship rows for the
# biological associations extension.
#
# The result is memoized: build_zip and cleanup both call this method, and
# without memoization the second call re-ran the whole CSV export and created a
# second tempfile, leaving the first one (the one actually zipped) never unlinked.
#
# Timing of the export is written to the Rails log rather than to ad-hoc files
# under /tmp, which were never removed.
#
# @return [Tempfile, nil] nil when no biological associations extension was requested
def biological_associations_resource_relationship_tmp
  return nil if biological_associations_extension.nil?
  return @biological_associations_resource_relationship_tmp if @biological_associations_resource_relationship_tmp

  content = nil

  if no_records?
    content = "\n"
  else
    timing = Benchmark.measure do
      content = Export::CSV::Dwc::Extension::BiologicalAssociations.csv(biological_associations_extension, biological_association_relations_to_core)
    end

    Rails.logger.debug "dwca_export: BiologicalAssociations CSV export: #{timing}"
  end

  # Create the file only after the content exists so a failed export is not memoized.
  tmp = Tempfile.new('biological_resource_relationship.xml')
  tmp.write(content)
  tmp.flush
  tmp.rewind

  @biological_associations_resource_relationship_tmp = tmp
end

#build_zipObject



747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
# File 'lib/export/dwca/data.rb', line 747

# Assembles the archive from the already-written component files.
#
# Optional extension files are added only when their builder actually returned a
# file: media_tmp returns nil for an empty (but truthy) media_extension, which
# previously raised NoMethodError on `nil.path`.
#
# @return [Tempfile] the zip archive
def build_zip
  archive = Tempfile.new(filename)

  # Presumably initialises the tempfile as an empty zip so Zip::File can open it - TODO confirm.
  Zip::OutputStream.open(archive) { |zos| }

  Zip::File.open(archive.path, Zip::File::CREATE) do |zip|
    zip.add('data.tsv', all_data.path)

    media_file = media_tmp if media_extension
    zip.add('media.tsv', media_file.path) if media_file

    relationships_file = biological_associations_resource_relationship_tmp if biological_associations_extension
    zip.add('resource_relationships.tsv', relationships_file.path) if relationships_file

    zip.add('meta.xml', meta.path)
    zip.add('eml.xml', eml.path)
  end

  archive
end

#cleanupTrue

Returns close and delete all temporary files.

Returns:

  • (True)

    close and delete all temporary files



782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
# File 'lib/export/dwca/data.rb', line 782

# Closes and deletes every temporary file this export holds.
#
# Files that were never built are skipped. The documented usage calls this from
# an `ensure`, so it must not raise NoMethodError (and mask the original error)
# when the export failed before a file was created.
#
# The media and resource-relationship files are read from their instance
# variables rather than through media_tmp /
# biological_associations_resource_relationship_tmp, because those methods build
# a file when called and would regenerate data only to delete it.
#
# @return [True]
def cleanup
  Rails.logger.debug 'dwca_export: cleanup start'

  tempfiles = [zipfile, meta, eml, data]
  tempfiles << @biological_associations_resource_relationship_tmp if biological_associations_extension
  tempfiles << @media_tmp if media_extension
  tempfiles << predicate_data if predicate_options_present?
  tempfiles << taxonworks_extension_data if taxonworks_options_present?
  tempfiles << all_data

  tempfiles.compact.each do |file|
    file.close
    file.unlink
  end

  Rails.logger.debug 'dwca_export: cleanup end'

  true
end

#collecting_event_attributesObject

@return Array

1 row per CO per DA (type) on CE


435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
# File 'lib/export/dwca/data.rb', line 435

# Fetches the internal data attributes recorded on the collecting events of the
# scoped collection objects: 1 row per CO per DA (type) on CE.
#
# The predicate ids are interpolated into raw SQL, so each one is coerced to an
# Integer first; numeric input produces exactly the same statement as before,
# anything else can no longer alter the query.
#
# @return [Array<Array>] rows of [collection_object_id, predicate, value]
def collecting_event_attributes
  relevant_objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql

  predicate_ids = Array(collecting_event_predicate_ids).map(&:to_i)
  # No selected predicates means no restriction at all (not "match nothing").
  predicate_filter = predicate_ids.any? ? " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" : ''

  sql = <<~SQL
    WITH relevant_collection_objects AS (
        #{relevant_objects_sql}
    )

    SELECT
        relevant_collection_objects.id AS co_id,
        CONCAT('TW:DataAttribute:CollectingEvent:', cvt.name) AS predicate,
        da.value
    FROM
        data_attributes da
        JOIN collecting_events ce ON ce.id = da.attribute_subject_id
             AND da.attribute_subject_type = 'CollectingEvent'
             AND da.type = 'InternalAttribute'
        LEFT JOIN relevant_collection_objects ON ce.id = relevant_collection_objects.collecting_event_id
        JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
            AND cvt.type = 'Predicate'
    WHERE relevant_collection_objects.id IS NOT null#{predicate_filter}
  SQL

  DataAttribute.connection.execute(sql).map { |row| row.values_at('co_id', 'predicate', 'value') }
end

#collecting_event_attributes_queryObject

Returns Relation the unique attributes derived from CollectingEvents.

Returns:

  • Relation the unique attributes derived from CollectingEvents



424
425
426
427
428
429
430
431
# File 'lib/export/dwca/data.rb', line 424

# Builds a relation over the internal attributes attached to the collecting
# events touched by this export, restricted to the selected predicate ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation reading from the generated subquery
def collecting_event_attributes_query
  events_sql = collecting_events.to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collecting_events as tce1 on data_attributes.attribute_subject_id = tce1.id AND data_attributes.attribute_subject_type = 'CollectingEvent'")
    .where(controlled_vocabulary_term_id: collecting_event_predicate_ids)
    .to_sql

  # The CTE is wrapped in a subquery aliased to the model's own table name.
  ::InternalAttribute.from("(WITH touched_collecting_events AS (#{events_sql}) #{attributes_sql}) as data_attributes")
end

#collecting_event_predicate_idsObject



117
118
119
# File 'lib/export/dwca/data.rb', line 117

# The predicate ids selected for CollectingEvent data attributes, read from the
# :collecting_event_predicate_id key of @data_predicate_ids ([] unless the
# caller supplied the key to initialize).
def collecting_event_predicate_ids
  @data_predicate_ids[:collecting_event_predicate_id]
end

#collecting_eventsObject

rubocop:enable Metrics/MethodLength



382
383
384
385
386
387
388
389
# File 'lib/export/dwca/data.rb', line 382

# Builds a relation of the distinct collecting events referenced by the scoped
# collection objects.
#
# @return [ActiveRecord::Relation] CollectingEvent relation reading from the generated subquery
def collecting_events
  scoped_objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql

  events_sql = ::CollectingEvent
    .joins('JOIN co_scoped as co_scoped1 on co_scoped1.collecting_event_id = collecting_events.id')
    .distinct
    .to_sql

  # The CTE is wrapped in a subquery aliased to the model's own table name.
  ::CollectingEvent.from("(WITH co_scoped AS (#{scoped_objects_sql}) #{events_sql}) as collecting_events")
end

#collection_object_attributesObject



402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
# File 'lib/export/dwca/data.rb', line 402

# Fetches the internal data attributes recorded directly on the scoped
# collection objects.
#
# The predicate ids are interpolated into raw SQL, so each one is coerced to an
# Integer first; numeric input produces exactly the same statement as before,
# anything else can no longer alter the query.
#
# @return [Array<Array>] rows of [collection_object_id, predicate, value]
def collection_object_attributes
  relevant_objects_sql = collection_objects.unscope(:order).select(:id).to_sql

  predicate_ids = Array(collection_object_predicate_ids).map(&:to_i)
  # No selected predicates means no restriction at all (not "match nothing").
  predicate_filter = predicate_ids.any? ? " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" : ''

  sql = <<~SQL
    WITH relevant_collection_objects AS (
        #{relevant_objects_sql}
    )
    SELECT da.id, da.attribute_subject_id,
           CONCAT('TW:DataAttribute:CollectionObject:', cvt.name) AS predicate,
           da.value,
           da.controlled_vocabulary_term_id
    FROM data_attributes da
    JOIN relevant_collection_objects rco ON da.attribute_subject_id = rco.id
                                         AND da.attribute_subject_type = 'CollectionObject'
    JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
                                         AND cvt.type = 'Predicate'
    WHERE da.type = 'InternalAttribute'#{predicate_filter}
  SQL

  DataAttribute.connection.execute(sql).map { |row| row.values_at('attribute_subject_id', 'predicate', 'value') }
end

#collection_object_attributes_queryObject



391
392
393
394
395
396
397
398
399
400
# File 'lib/export/dwca/data.rb', line 391

# Builds a relation over the internal attributes attached to the scoped
# collection objects, joined to their predicate and restricted to the selected
# predicate ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation reading from the generated subquery
def collection_object_attributes_query
  scoped_objects_sql = collection_objects.unscope(:order).select(:id).to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collection_objects as tco1 on data_attributes.attribute_subject_id = tco1.id AND data_attributes.attribute_subject_type = 'CollectionObject'")
    .to_sql

  subquery = "(WITH touched_collection_objects AS (#{scoped_objects_sql}) #{attributes_sql}) as data_attributes"

  ::InternalAttribute
    .joins(:predicate)
    .where(controlled_vocabulary_term_id: collection_object_predicate_ids)
    .from(subquery)
end

#collection_object_predicate_idsObject



113
114
115
# File 'lib/export/dwca/data.rb', line 113

# The predicate ids selected for CollectionObject data attributes, read from the
# :collection_object_predicate_id key of @data_predicate_ids ([] unless the
# caller supplied the key to initialize).
def collection_object_predicate_ids
  @data_predicate_ids[:collection_object_predicate_id]
end

#collection_objectsObject



459
460
461
462
463
464
465
466
# File 'lib/export/dwca/data.rb', line 459

# Builds a relation of the collection objects behind the core DwcOccurrence
# records, exposing only id, collecting_event_id and type.
#
# NOTE(review): calls relation methods on core_scope, so a String core_scope
# would fail here - confirm against callers.
#
# @return [ActiveRecord::Relation] CollectionObject relation reading from the generated subquery
def collection_objects
  occurrences_sql = core_scope
    .unscope(:order)
    .select('dwc_occurrences.dwc_occurrence_object_id, dwc_occurrences.dwc_occurrence_object_type')
    .to_sql

  objects_sql = ::CollectionObject
    .joins("JOIN dwc_scoped as dwc_scoped1 on dwc_scoped1.dwc_occurrence_object_id = collection_objects.id and dwc_scoped1.dwc_occurrence_object_type = 'CollectionObject'")
    .select(:id, :collecting_event_id, :type)
    .to_sql

  # The CTE is wrapped in a subquery aliased to the model's own table name.
  ::CollectionObject.from("(WITH dwc_scoped AS (#{occurrences_sql}) #{objects_sql}) as collection_objects")
end

#csvCSV

Returns the data as a CSV object.

Returns:

  • (CSV)

    the data as a CSV object



174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/export/dwca/data.rb', line 174

# Generates the core CSV by handing the scope's computed columns to
# ::Export::CSV.generate_csv together with the DwC-specific formatting options.
#
# @return [Object] whatever ::Export::CSV.generate_csv returns (documented on this page as the CSV data)
def csv
  columns = core_scope.computed_columns

  options = {
    # TODO: check to see if we need this
    exclude_columns: ::DwcOccurrence.excluded_columns,
    # TODO: add other maps here
    column_order: ::CollectionObject::DWC_OCCURRENCE_MAP.keys + ::CollectionObject::EXTENSION_FIELDS,
    # going to have to be optional
    trim_columns: true,
    trim_rows: false,
    header_converters: [:dwc_headers],
    copy_column: { from: 'occurrenceID', to: 'id' }
  }

  ::Export::CSV.generate_csv(columns, **options)
end

#extension_computed_fields_data(methods) ⇒ Object

TODO: return, or optimize to this when ::CollectionObject::EXTENSION_COMPUTED_FIELDS.size > 1 def extension_computed_fields_data(methods)

d = []
collection_objects.find_each do |object|
  methods.each_pair { |method, name| d  << [object.id, name, object.send(method)] }
end
d

end

!! This will have to be reverted to above when > 1 EXTENSION field is present



235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/export/dwca/data.rb', line 235

# Produces the TW:Internal:otu_name rows for the scoped collection objects.
#
# !! Only the otu_name field is computed here; `methods` is consulted solely to
# decide whether there is anything to do.
#
# @param methods [#empty?] the requested extension methods
# @return [Array<Array>] rows of [collection_object_id, 'TW:Internal:otu_name', otu name or nil]
def extension_computed_fields_data(methods)
  return [] if methods.empty?

  predicate = 'TW:Internal:otu_name'.freeze

  # Alternative kept for reference:
  # n = "COALESCE( otus.name, TRIM(CONCAT(cached, ' ', cached_author_year))) as otu_name"

  records = collection_objects
    .left_joins(otu: [:taxon_name])
    .select("collection_objects.id, otus.name as otu_name")
    .where(taxon_determinations: { position: '1' })
    .find_each(batch_size: 10_000)

  records.map { |record| [record.id, predicate, record['otu_name'].presence] }
end

#media_tmpObject



656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
# File 'lib/export/dwca/data.rb', line 656

# Builds, once, the tempfile holding the media extension rows.
#
# The result is memoized: build_zip and cleanup both call this method, and
# without memoization the second call re-ran the media export and created a
# second tempfile, leaving the first one (the one actually zipped) never unlinked.
#
# @return [Tempfile, nil] nil when no media extension was requested (nil or empty)
def media_tmp
  return nil if media_extension.nil? || media_extension.empty?
  return @media_tmp if @media_tmp

  content =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::Media.csv(media_extension[:collection_objects], media_extension[:field_occurrences])
    end

  # Create the file only after the content exists so a failed export is not memoized.
  tmp = Tempfile.new('media.xml')
  tmp.write(content)
  tmp.flush
  tmp.rewind

  @media_tmp = tmp
end

#meta_fieldsArray

non-standard DwC columns are handled elsewhere

Returns:

  • (Array)

    use the temporarily written, and refined, CSV file to read off the existing headers so we can use them in writing meta.xml



677
678
679
680
681
682
# File 'lib/export/dwca/data.rb', line 677

# Reads the header row of the combined data file and returns its column names,
# minus the first one.
#
# @return [Array<String>] header names after the leading column; [] when there
#   are no records or the file has no first line
def meta_fields
  return [] if no_records?

  header_line = File.open(all_data) { |io| io.gets }
  return [] if header_line.nil?

  columns = header_line.strip.split("\t")
  columns.shift # the first column, id, will be specified by hand
  columns
end

#no_records?Boolean

Returns true if provided core_scope returns no records.

Returns:

  • (Boolean)

    true if provided core_scope returns no records



189
190
191
# File 'lib/export/dwca/data.rb', line 189

# True when `total` equals 0. A nil total (never assigned) compares unequal and
# therefore yields false.
def no_records?
  total == 0
end

#package_download(download) ⇒ Object

Parameters:



824
825
826
827
828
829
# File 'lib/export/dwca/data.rb', line 824

# Points the given download at the built zipfile.
#
# @param download [#update!] the record to update
# @return [Object] whatever download.update! returns
def package_download(download)
  # This doesn't touch the db (source_file_path is an instance var).
  download.update!(source_file_path: zipfile.path)
end

#predicate_options_present?Boolean

Returns:

  • (Boolean)


160
161
162
# File 'lib/export/dwca/data.rb', line 160

# Whether any predicate id was selected for either collection objects or
# collecting events.
#
# @return [Boolean]
def predicate_options_present?
  %i[collection_object_predicate_id collecting_event_predicate_id].any? do |key|
    data_predicate_ids[key].present?
  end
end

#taxonworks_options_present?Boolean

Returns:

  • (Boolean)


164
165
166
# File 'lib/export/dwca/data.rb', line 164

# Whether any TaxonWorks extension method was requested.
#
# @return [Boolean]
def taxonworks_options_present?
  !taxonworks_extension_methods.blank?
end

#used_collecting_event_predicatesObject



474
475
476
477
478
# File 'lib/export/dwca/data.rb', line 474

# Lists the distinct predicate names, prefixed with
# 'TW:DataAttribute:CollectingEvent:', found by collecting_event_attributes_query.
#
# @return [Array] the predicate_name value of each returned row
def used_collecting_event_predicates
  named = collecting_event_attributes_query
    .joins(:predicate)
    .select("CONCAT('TW:DataAttribute:CollectingEvent:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named.map { |row| row['predicate_name'] }
end

#used_collection_object_predicatesObject



468
469
470
471
472
# File 'lib/export/dwca/data.rb', line 468

# Lists the distinct predicate names, prefixed with
# 'TW:DataAttribute:CollectionObject:', found by collection_object_attributes_query.
#
# @return [Array] the predicate_name value of each returned row
def used_collection_object_predicates
  named = collection_object_attributes_query
    .select("CONCAT('TW:DataAttribute:CollectionObject:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named.map { |row| row['predicate_name'] }
end

#used_predicatesArray

Returns of distinct Predicate names in the format

`TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`.

Returns:

  • (Array)

    of distinct Predicate names in the format

    `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
    


483
484
485
# File 'lib/export/dwca/data.rb', line 483

# Combines the predicate names used on collection objects with those used on
# collecting events, collection-object names first.
#
# @return [Array] names in the format `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
def used_predicates
  from_collection_objects = used_collection_object_predicates
  from_collecting_events = used_collecting_event_predicates

  from_collection_objects + from_collecting_events
end