Class: Export::Dwca::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/data.rb

Overview

!! This export does not support AssertedDistribution data at the moment. While those data are indexed, if they are in the `core_scope` they will almost certainly cause problems or be ignored. !!

Wrapper to build DWCA zipfiles for a specific project. See tasks/accessions/report/dwc_controller.rb for use.

With help from thinkingeek.com/2013/11/15/create-temporary-zip-file-send-response-rails/

Usage:

begin
 data = Dwca::Data.new(core_scope: DwcOccurrence.where(project_id: sessions_current_project_id))
ensure
 data.cleanup
end

Always use the ensure/data.cleanup pattern!

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: []) ⇒ Data

Returns a new instance of Data.

Parameters:

  • taxonworks_extensions (Array<Symbol>) (defaults to: [])

    List of methods to perform on each CO

Raises:

  • (ArgumentError)


84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/export/dwca/data.rb', line 84

# Stores the export configuration on the instance.
#
# @param core_scope [Object] required scope of core records (documented on the
#   reader as a String of fully formed SQL or an ActiveRecord::Relation)
# @param extension_scopes [Hash] optional extension settings, read via
#   :biological_associations (Hash with keys core_params, collection_objects_query)
#   and :media (Hash with keys collection_objects, field_occurrences)
# @param predicate_extensions [Hash] merged over the default empty
#   :collection_object_predicate_id / :collecting_event_predicate_id lists
# @param eml_data [Hash] stored as-is
# @param taxonworks_extensions [Array<Symbol>] list of methods to perform on each CO
# @raise [ArgumentError] when core_scope is nil
def initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: [])
  raise ArgumentError, 'must pass a core_scope' if core_scope.nil?

  default_predicate_ids = { collection_object_predicate_id: [], collecting_event_predicate_id: [] }

  @core_scope = core_scope
  @eml_data = eml_data
  @taxonworks_extension_methods = taxonworks_extensions

  # Caller-supplied lists win over the empty defaults.
  @data_predicate_ids = default_predicate_ids.merge(predicate_extensions)

  @media_extension = extension_scopes[:media]
  @biological_associations_extension = extension_scopes[:biological_associations]
end

Instance Attribute Details

#all_dataObject

Returns Tempfile.

Returns:

  • Tempfile



77
78
79
# File 'lib/export/dwca/data.rb', line 77

# Reader for @all_data (documented type: Tempfile). #build_zip adds this
# file to the archive as data.tsv.
def all_data
  @all_data
end

#biological_associations_extensionHash

Returns a Hash with keys core_params, collection_objects_query.

Returns:

  • (Hash)

    with keys core_params, collection_objects_query



51
52
53
# File 'lib/export/dwca/data.rb', line 51

# Reader for @biological_associations_extension, which #initialize takes from
# extension_scopes[:biological_associations]; nil when that key is absent.
def biological_associations_extension
  @biological_associations_extension
end

#collection_object_idsObject

TODO Breaks when AssertedDistribution is added



69
70
71
# File 'lib/export/dwca/data.rb', line 69

# Reader for @collection_object_ids.
# TODO (carried over from the original docs): breaks when AssertedDistribution is added.
def collection_object_ids
  @collection_object_ids
end

#core_scopeObject

!params core_scope [String, ActiveRecord::Relation]

String is fully formed SQL


48
49
50
# File 'lib/export/dwca/data.rb', line 48

# Reader for @core_scope, the required constructor argument. Documented as a
# String (fully formed SQL) or an ActiveRecord::Relation.
def core_scope
  @core_scope
end

#dataTempfile

Returns the csv data as a tempfile.

Returns:

  • (Tempfile)

    the csv data as a tempfile



197
198
199
# File 'lib/export/dwca/data.rb', line 197

# Reader for @data.
# @return [Tempfile] the csv data as a tempfile
def data
  @data
end

#data_predicate_idsObject

collection_object_predicate_id: [], collecting_event_predicate_id: []

Returns:

  • Hash



64
65
66
# File 'lib/export/dwca/data.rb', line 64

# Reader for @data_predicate_ids.
# @return [Hash] keyed by :collection_object_predicate_id and
#   :collecting_event_predicate_id (both default to [] in #initialize)
def data_predicate_ids
  @data_predicate_ids
end

#dwc_id_orderObject

Get order of ids that matches core records so we can align with csv zero! Like 2=>1, 3=>2, 4=>3, 5=>4

Returns:

  • Hash



81
82
83
# File 'lib/export/dwca/data.rb', line 81

# Reader for @dwc_id_order. Documented as a Hash giving the order of ids that
# matches the core records, used to align with the csv.
def dwc_id_order
  @dwc_id_order
end

#emlTempfile

This is a stub, and only half-heartedly done. You should be using IPT for the time being. See also

https://github.com/gbif/ipt/wiki/
https://github.com/gbif/ipt/wiki/#exemplar-datasets

TODO: reference biological_resource_extension.csv

Returns:

  • (Tempfile)

    metadata about this dataset



591
592
593
# File 'lib/export/dwca/data.rb', line 591

# Reader for @eml. #build_zip adds this file to the archive as eml.xml.
# @return [Tempfile] metadata about this dataset
def eml
  @eml
end

#eml_dataHash

for use in construction of the eml file.

Returns:

  • (Hash)

    containing dataset and additional_metadata, as xml strings,



38
39
40
# File 'lib/export/dwca/data.rb', line 38

# Reader for @eml_data, stored unchanged from the constructor's eml_data argument.
# @return [Hash] containing dataset and additional_metadata, as xml strings,
#   for use in construction of the eml file
def eml_data
  @eml_data
end

#filenameString (readonly)

the name of zipfile

Returns:

  • (String)


773
774
775
# File 'lib/export/dwca/data.rb', line 773

# Reader for @filename; #build_zip passes it to Tempfile.new.
# @return [String] the name of zipfile
def filename
  @filename
end

#media_extensionHash

Returns of collection_objects: query_string, field_occurrences: query_string.

Returns:

  • (Hash)

    of collection_objects: query_string, field_occurrences: query_string



54
55
56
# File 'lib/export/dwca/data.rb', line 54

# Reader for @media_extension, which #initialize takes from
# extension_scopes[:media]; nil when that key is absent.
# @return [Hash] with keys collection_objects, field_occurrences
def media_extension
  @media_extension
end

#metaObject

Returns the value of attribute meta.



42
43
44
# File 'lib/export/dwca/data.rb', line 42

# Reader for @meta. #build_zip adds this file to the archive as meta.xml.
def meta
  @meta
end

#predicate_dataObject

Returns the value of attribute predicate_data.



60
61
62
# File 'lib/export/dwca/data.rb', line 60

# Reader for @predicate_data. #cleanup closes and unlinks it when
# predicate_options_present? is true.
def predicate_data
  @predicate_data
end

#taxonworks_extension_dataObject

rubocop:disable Metrics/MethodLength



254
255
256
# File 'lib/export/dwca/data.rb', line 254

# Reader for @taxonworks_extension_data. #cleanup closes and unlinks it when
# taxonworks_options_present? is true.
def taxonworks_extension_data
  @taxonworks_extension_data
end

#taxonworks_extension_methodsObject

Returns the value of attribute taxonworks_extension_methods.



74
75
76
# File 'lib/export/dwca/data.rb', line 74

# Reader for @taxonworks_extension_methods, set in #initialize from the
# taxonworks_extensions argument (defaults to []).
def taxonworks_extension_methods
  @taxonworks_extension_methods
end

#totalObject

TODO update



56
57
58
# File 'lib/export/dwca/data.rb', line 56

# Reader for @total; no_records? compares it with 0.
# TODO (carried over from the original docs): update
def total
  @total
end

#zipfileTempfile

Returns the zipfile.

Returns:

  • (Tempfile)

    the zipfile



764
765
766
# File 'lib/export/dwca/data.rb', line 764

# Reader for @zipfile; #package_download uses its path.
# @return [Tempfile] the zipfile
def zipfile
  @zipfile
end

Instance Method Details

#biological_association_relations_to_coreObject

rubocop:enable Metrics/MethodLength



610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
# File 'lib/export/dwca/data.rb', line 610

# Runs the BiologicalAssociation filter once per role (:subject, then
# :object), each time passing the extension's :core_params as the
# dwc_occurrence_query, and plucks the ids of the matches.
#
# @return [Hash{Symbol => Set}] { subject: Set of ids, object: Set of ids }
def biological_association_relations_to_core
  occurrence_params = {
    dwc_occurrence_query: @biological_associations_extension[:core_params]
  }

  %i[subject object].each_with_object({}) do |role, ids_by_role|
    associations = ::Queries::BiologicalAssociation::Filter.new(
      collection_object_query: occurrence_params,
      collection_object_as_subject_or_as_object: role
    ).all

    ids_by_role[role] = Set.new(associations.pluck(:id))
  end
end

#biological_associations_resource_relationship_tmpObject



633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
# File 'lib/export/dwca/data.rb', line 633

# Builds the resource-relationship extension file once and returns the same
# Tempfile on every later call, so the file added to the archive is the very
# object that callers close and unlink.
#
# The content is a lone newline when no_records? is true; otherwise it is
# whatever Export::CSV::Dwc::Extension::BiologicalAssociations.csv returns for
# the extension settings and biological_association_relations_to_core.
#
# @return [Tempfile, nil] the rewound file, or nil when
#   biological_associations_extension is nil
def biological_associations_resource_relationship_tmp
  return nil if biological_associations_extension.nil?
  return @biological_associations_resource_relationship_tmp if @biological_associations_resource_relationship_tmp

  content =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::BiologicalAssociations.csv(
        biological_associations_extension,
        biological_association_relations_to_core
      )
    end

  # The content is generated before the file so a failure there leaves no
  # orphaned Tempfile behind.
  file = Tempfile.new('biological_resource_relationship.xml')
  file.write(content)
  file.flush
  file.rewind

  @biological_associations_resource_relationship_tmp = file
end

#build_zipObject



745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
# File 'lib/export/dwca/data.rb', line 745

# Assembles the archive in a new Tempfile named after #filename.
#
# Always adds data.tsv, meta.xml and eml.xml. media.tsv and
# resource_relationships.tsv are added only when their builder actually
# returned a file: media_tmp returns nil for a nil *or empty* media_extension,
# so testing the extension hash alone would call #path on nil.
#
# @return [Tempfile] the tempfile backing the zip archive
def build_zip
  t = Tempfile.new(filename)

  # NOTE(review): presumably this initializes the tempfile as an empty archive
  # before Zip::File reopens it by path — confirm against the rubyzip docs.
  Zip::OutputStream.open(t) { |_zos| }

  Zip::File.open(t.path, create: true) do |zip|
    zip.add('data.tsv', all_data.path)

    media = media_tmp
    zip.add('media.tsv', media.path) if media

    relationships = biological_associations_resource_relationship_tmp
    zip.add('resource_relationships.tsv', relationships.path) if relationships

    zip.add('meta.xml', meta.path)
    zip.add('eml.xml', eml.path)
  end
  t
end

#cleanupTrue

Closes and deletes all temporary files.

Returns:

  • (True)

    true, after closing and deleting all temporary files



780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
# File 'lib/export/dwca/data.rb', line 780

# Closes and unlinks every temporary file this export holds.
#
# Meant to run from an `ensure` block, so it tolerates files that were never
# built (nil) instead of raising there.
#
# @return [true]
def cleanup
  Rails.logger.debug 'dwca_export: cleanup start'

  # Close + unlink a single file, skipping anything that was never created.
  release = lambda do |file|
    next if file.nil?

    file.close
    file.unlink
  end

  release.call(zipfile)
  release.call(meta)
  release.call(eml)
  release.call(data)

  # The extension files are read from their instance variables rather than
  # through their builder methods, so cleanup only releases files that already
  # exist and never triggers building new ones. This also covers an empty
  # media_extension, for which media_tmp returns nil.
  release.call(@biological_associations_resource_relationship_tmp)
  release.call(@media_tmp)

  release.call(predicate_data) if predicate_options_present?
  release.call(taxonworks_extension_data) if taxonworks_options_present?

  release.call(all_data)

  Rails.logger.debug 'dwca_export: cleanup end'

  true
end

#collecting_event_attributesObject

@return Array

1 row per CO per DA (type) on CE


437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# File 'lib/export/dwca/data.rb', line 437

# Fetches InternalAttribute values attached to the CollectingEvents of the
# scoped collection objects, via one raw SQL statement.
#
# When collecting_event_predicate_ids is non-empty the result is limited to
# those predicates; when it is empty no predicate filter is applied. Each id
# is passed through the connection's #quote before being placed in the IN
# list, so a non-numeric value cannot alter the statement.
#
# @return [Array<Array>] rows of [co_id, predicate, value]; 1 row per CO per
#   DA (type) on CE
def collecting_event_attributes
  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql}
  )

  SELECT
      relevant_collection_objects.id AS co_id,
      CONCAT('TW:DataAttribute:CollectingEvent:', cvt.name) AS predicate,
      da.value
  FROM
      data_attributes da
      JOIN collecting_events ce ON ce.id = da.attribute_subject_id
           AND da.attribute_subject_type = 'CollectingEvent'
           AND da.type = 'InternalAttribute'
      LEFT JOIN relevant_collection_objects ON ce.id = relevant_collection_objects.collecting_event_id
      JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
          AND cvt.type = 'Predicate'
  WHERE relevant_collection_objects.id IS NOT null"

  connection = DataAttribute.connection
  predicate_ids = collecting_event_predicate_ids

  if predicate_ids.any?
    id_list = predicate_ids.map { |id| connection.quote(id) }.join(',')
    q += " AND da.controlled_vocabulary_term_id IN (#{id_list})"
  end

  connection.execute(q).map { |r| [r['co_id'], r['predicate'], r['value']] }
end

#collecting_event_attributes_queryObject

Returns Relation the unique attributes derived from CollectingEvents.

Returns:

  • Relation the unique attributes derived from CollectingEvents



426
427
428
429
430
431
432
433
# File 'lib/export/dwca/data.rb', line 426

# Builds an InternalAttribute relation whose FROM clause is a CTE-backed
# subquery: InternalAttributes joined to the scoped collecting events and
# filtered to collecting_event_predicate_ids.
#
# @return [Object] the relation returned by ::InternalAttribute.from
def collecting_event_attributes_query
  events_sql = collecting_events.to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collecting_events as tce1 on data_attributes.attribute_subject_id = tce1.id AND data_attributes.attribute_subject_type = 'CollectingEvent'")
    .where(controlled_vocabulary_term_id: collecting_event_predicate_ids)
    .to_sql

  # The CTE plus the select is wrapped and aliased as data_attributes so the
  # model can read from it as if it were its own table.
  ::InternalAttribute.from("(WITH touched_collecting_events AS (#{events_sql}) #{attributes_sql}) as data_attributes")
end

#collecting_event_predicate_idsObject



117
118
119
# File 'lib/export/dwca/data.rb', line 117

# Returns the value stored under :collecting_event_predicate_id in
# @data_predicate_ids (an empty Array unless predicate_extensions supplied
# one to #initialize).
def collecting_event_predicate_ids
  @data_predicate_ids[:collecting_event_predicate_id]
end

#collecting_eventsObject

rubocop:enable Metrics/MethodLength



384
385
386
387
388
389
390
391
# File 'lib/export/dwca/data.rb', line 384

# Builds a CollectingEvent relation whose FROM clause is a CTE-backed
# subquery: the distinct collecting events joined to the scoped collection
# objects (selected without their ordering).
#
# @return [Object] the relation returned by ::CollectingEvent.from
def collecting_events
  scoped_objects_sql = collection_objects
    .unscope(:order)
    .select(:id, :collecting_event_id)
    .to_sql

  events_sql = ::CollectingEvent
    .joins('JOIN co_scoped as co_scoped1 on co_scoped1.collecting_event_id = collecting_events.id')
    .distinct
    .to_sql

  ::CollectingEvent.from("(WITH co_scoped AS (#{scoped_objects_sql}) #{events_sql}) as collecting_events")
end

#collection_object_attributesObject



404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
# File 'lib/export/dwca/data.rb', line 404

# Fetches InternalAttribute values attached directly to the scoped collection
# objects, via one raw SQL statement.
#
# When collection_object_predicate_ids is non-empty the result is limited to
# those predicates; when it is empty no predicate filter is applied. Each id
# is passed through the connection's #quote before being placed in the IN
# list, so a non-numeric value cannot alter the statement.
#
# @return [Array<Array>] rows of [attribute_subject_id, predicate, value]
def collection_object_attributes
  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id).to_sql}
  )
  SELECT da.id, da.attribute_subject_id,
         CONCAT('TW:DataAttribute:CollectionObject:', cvt.name) AS predicate,
         da.value,
         da.controlled_vocabulary_term_id
  FROM data_attributes da
  JOIN relevant_collection_objects rco ON da.attribute_subject_id = rco.id
                                       AND da.attribute_subject_type = 'CollectionObject'
  JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
                                       AND cvt.type = 'Predicate'
  WHERE da.type = 'InternalAttribute'"

  connection = DataAttribute.connection
  predicate_ids = collection_object_predicate_ids

  if predicate_ids.any?
    id_list = predicate_ids.map { |id| connection.quote(id) }.join(',')
    q += " AND da.controlled_vocabulary_term_id IN (#{id_list})"
  end

  connection.execute(q).map { |r| [r['attribute_subject_id'], r['predicate'], r['value']] }
end

#collection_object_attributes_queryObject



393
394
395
396
397
398
399
400
401
402
# File 'lib/export/dwca/data.rb', line 393

# Builds an InternalAttribute relation (joined to :predicate and filtered to
# collection_object_predicate_ids) whose FROM clause is a CTE-backed subquery
# of InternalAttributes joined to the scoped collection objects.
#
# @return [Object] the relation returned by the final .from call
def collection_object_attributes_query
  scoped_ids_sql = collection_objects.unscope(:order).select(:id).to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collection_objects as tco1 on data_attributes.attribute_subject_id = tco1.id AND data_attributes.attribute_subject_type = 'CollectionObject'")
    .to_sql

  subquery = "(WITH touched_collection_objects AS (#{scoped_ids_sql}) #{attributes_sql}) as data_attributes"

  ::InternalAttribute
    .joins(:predicate)
    .where(controlled_vocabulary_term_id: collection_object_predicate_ids)
    .from(subquery)
end

#collection_object_predicate_idsObject



113
114
115
# File 'lib/export/dwca/data.rb', line 113

# Returns the value stored under :collection_object_predicate_id in
# @data_predicate_ids (an empty Array unless predicate_extensions supplied
# one to #initialize).
def collection_object_predicate_ids
  @data_predicate_ids[:collection_object_predicate_id]
end

#collection_objectsObject



461
462
463
464
465
466
467
468
# File 'lib/export/dwca/data.rb', line 461

# Builds a CollectionObject relation whose FROM clause is a CTE-backed
# subquery: collection objects (id, collecting_event_id, type) joined to the
# core_scope rows whose dwc_occurrence_object_type is 'CollectionObject'.
#
# NOTE(review): calls #unscope on core_scope, so this path presumably expects
# a relation rather than a SQL String — confirm against callers.
#
# @return [Object] the relation returned by ::CollectionObject.from
def collection_objects
  occurrences_sql = core_scope
    .unscope(:order)
    .select('dwc_occurrences.dwc_occurrence_object_id, dwc_occurrences.dwc_occurrence_object_type')
    .to_sql

  objects_sql = ::CollectionObject
    .joins("JOIN dwc_scoped as dwc_scoped1 on dwc_scoped1.dwc_occurrence_object_id = collection_objects.id and dwc_scoped1.dwc_occurrence_object_type = 'CollectionObject'")
    .select(:id, :collecting_event_id, :type)
    .to_sql

  ::CollectionObject.from("(WITH dwc_scoped AS (#{occurrences_sql}) #{objects_sql}) as collection_objects")
end

#csvCSV

Returns the data as a CSV object.

Returns:

  • (CSV)

    the data as a CSV object



176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/export/dwca/data.rb', line 176

# Hands core_scope.computed_columns to ::Export::CSV.generate_csv together
# with the fixed DwC formatting options below.
#
# @return [Object] whatever ::Export::CSV.generate_csv returns (documented as
#   the data as a CSV object)
def csv
  scope = core_scope.computed_columns

  options = {
    # TODO: check to see if we need this
    exclude_columns: ::DwcOccurrence.excluded_columns,
    # TODO: add other maps here
    column_order: ::CollectionObject::DWC_OCCURRENCE_MAP.keys + ::CollectionObject::EXTENSION_FIELDS,
    # going to have to be optional
    trim_columns: true,
    trim_rows: false,
    header_converters: [:dwc_headers],
    copy_column: { from: 'occurrenceID', to: 'id' }
  }

  ::Export::CSV.generate_csv(scope, **options)
end

#extension_computed_fields_data(methods) ⇒ Object

TODO: return, or optimize to this when ::CollectionObject::EXTENSION_COMPUTED_FIELDS.size > 1:

def extension_computed_fields_data(methods)

d = []
collection_objects.find_each do |object|
  methods.each_pair { |method, name| d  << [object.id, name, object.send(method)] }
end
d

end

!! This will have to be reverted to above when > 1 EXTENSION field is present



237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/export/dwca/data.rb', line 237

# Produces one [collection_object_id, 'TW:Internal:otu_name', otu_name] row
# per scoped collection object, read in batches of 10000.
#
# The `methods` argument is only checked for emptiness here; the single
# otu_name field is hard-coded (see the note in the surrounding docs about
# reverting when more than one extension field exists).
#
# @param methods [#empty?] requested extension methods
# @return [Array<Array>] rows as described above; [] when methods is empty
def extension_computed_fields_data(methods)
  return [] if methods.empty?

  predicate = 'TW:Internal:otu_name'.freeze

  # Alternative kept from the original:
  # n = "COALESCE( otus.name, TRIM(CONCAT(cached, ' ', cached_author_year))) as otu_name"

  collection_objects
    .left_joins(otu: [:taxon_name])
    .select('collection_objects.id, otus.name as otu_name')
    .where(taxon_determinations: {position: '1'})
    .find_each(batch_size: 10000)
    .map { |row| [row.id, predicate, row['otu_name'].presence] }
end

#media_tmpObject



654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
# File 'lib/export/dwca/data.rb', line 654

# Builds the media extension file once and returns the same Tempfile on every
# later call, so the file added to the archive is the very object that
# callers close and unlink.
#
# The content is a lone newline when no_records? is true; otherwise it is
# whatever Export::CSV::Dwc::Extension::Media.csv returns for the extension's
# :collection_objects and :field_occurrences entries.
#
# @return [Tempfile, nil] the rewound file, or nil when media_extension is
#   nil or empty
def media_tmp
  return nil if media_extension.nil? || media_extension.empty?
  return @media_tmp if @media_tmp

  content =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::Media.csv(media_extension[:collection_objects], media_extension[:field_occurrences])
    end

  # The content is generated before the file so a failure there leaves no
  # orphaned Tempfile behind.
  file = Tempfile.new('media.xml')
  file.write(content)
  file.flush
  file.rewind

  @media_tmp = file
end

#meta_fieldsArray

Non-standard DwC columns are handled elsewhere.

Returns:

  • (Array)

    use the temporarily written, and refined, CSV file to read off the existing headers so we can use them in writing meta.xml



675
676
677
678
679
680
# File 'lib/export/dwca/data.rb', line 675

# Reads the first line of the all_data file and returns its tab-separated
# headers without the leading column.
#
# @return [Array<String>] header names after the first; [] when there are no
#   records or the file has no first line
def meta_fields
  return [] if no_records?

  header_line = File.open(all_data, &:gets)
  return [] unless header_line

  # The first column, id, will be specified by hand, so it is dropped here.
  header_line.strip.split("\t").drop(1)
end

#no_records?Boolean

Returns true if provided core_scope returns no records.

Returns:

  • (Boolean)

    true if provided core_scope returns no records



191
192
193
# File 'lib/export/dwca/data.rb', line 191

# @return [Boolean] true if the provided core_scope returns no records,
#   i.e. when #total equals 0
def no_records?
  total == 0
end

#package_download(download) ⇒ Object

Parameters:



822
823
824
825
826
827
# File 'lib/export/dwca/data.rb', line 822

# Points the given download at the built zipfile.
#
# @param download [#update!] record that receives source_file_path
# @return [Object] whatever download.update! returns
def package_download(download)
  # This doesn't touch the db (source_file_path is an instance var).
  download.update!(source_file_path: zipfile.path)
end

#predicate_options_present?Boolean

Returns:

  • (Boolean)


162
163
164
# File 'lib/export/dwca/data.rb', line 162

# @return [Boolean] true when either predicate id list in data_predicate_ids
#   (collection object or collecting event) is present
def predicate_options_present?
  %i[collection_object_predicate_id collecting_event_predicate_id].any? do |key|
    data_predicate_ids[key].present?
  end
end

#taxonworks_options_present?Boolean

Returns:

  • (Boolean)


166
167
168
# File 'lib/export/dwca/data.rb', line 166

# @return [Boolean] the result of #present? on taxonworks_extension_methods
def taxonworks_options_present?
  taxonworks_extension_methods.present?
end

#used_collecting_event_predicatesObject



476
477
478
479
480
# File 'lib/export/dwca/data.rb', line 476

# Lists the distinct predicate names found by collecting_event_attributes_query,
# each prefixed with 'TW:DataAttribute:CollectingEvent:'.
#
# @return [Array] the predicate_name value of each returned row
def used_collecting_event_predicates
  name_column = "CONCAT('TW:DataAttribute:CollectingEvent:', controlled_vocabulary_terms.name) predicate_name"

  collecting_event_attributes_query
    .joins(:predicate)
    .select(name_column)
    .distinct
    .map { |row| row['predicate_name'] }
end

#used_collection_object_predicatesObject



470
471
472
473
474
# File 'lib/export/dwca/data.rb', line 470

# Lists the distinct predicate names found by collection_object_attributes_query,
# each prefixed with 'TW:DataAttribute:CollectionObject:'.
#
# @return [Array] the predicate_name value of each returned row
def used_collection_object_predicates
  name_column = "CONCAT('TW:DataAttribute:CollectionObject:', controlled_vocabulary_terms.name) predicate_name"

  collection_object_attributes_query
    .select(name_column)
    .distinct
    .map { |row| row['predicate_name'] }
end

#used_predicatesArray

Returns an Array of distinct Predicate names in the format

`TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`.

Returns:

  • (Array)

    of distinct Predicate names in the format

    `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
    


485
486
487
# File 'lib/export/dwca/data.rb', line 485

# @return [Array] the collection-object predicate names followed by the
#   collecting-event predicate names, in the format
#   `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
def used_predicates
  used_collection_object_predicates + used_collecting_event_predicates
end