Class: Export::Dwca::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/data.rb

Overview

!! This export does not support AssertedDistribution data at the moment. While those data are indexed, if they are in the `core_scope` they will almost certainly cause problems or be ignored. !!

Wrapper to build DWCA zipfiles for a specific project. See tasks/accessions/report/dwc_controller.rb for use.

With help from thinkingeek.com/2013/11/15/create-temporary-zip-file-send-response-rails/

Usage:

begin
 data = Export::Dwca::Data.new(core_scope: DwcOccurrence.where(project_id: sessions_current_project_id))
ensure
 data.cleanup
end

Always use the ensure/data.cleanup pattern!

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: []) ⇒ Data

Returns a new instance of Data.

Parameters:

  • taxonworks_extensions (Array<Symbol>) (defaults to: [])

    List of methods to perform on each CO

Raises:

  • (ArgumentError)


84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/export/dwca/data.rb', line 84

# Builds an exporter around the given core scope.
#
# @param core_scope [Object] required; stored untouched in @core_scope
# @param extension_scopes [Hash] only :biological_associations and :media are read
# @param predicate_extensions [Hash] merged over the empty default predicate id lists
# @param eml_data [Hash] stored untouched in @eml_data
# @param taxonworks_extensions [Array<Symbol>] stored as @taxonworks_extension_methods
# @raise [ArgumentError] when core_scope is nil
def initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, eml_data: {}, taxonworks_extensions: [])
  raise ArgumentError, 'must pass a core_scope' if core_scope.nil?

  empty_predicate_ids = { collection_object_predicate_id: [], collecting_event_predicate_id: [] }

  @core_scope = core_scope
  @eml_data = eml_data
  @taxonworks_extension_methods = taxonworks_extensions

  # Caller-supplied predicate ids replace the empty defaults key by key.
  @data_predicate_ids = empty_predicate_ids.merge(predicate_extensions)

  # Hash with keys core_params, collection_objects_query
  @biological_associations_extension = extension_scopes[:biological_associations]
  # Hash with keys collection_objects, field_occurrences
  @media_extension = extension_scopes[:media]
end

Instance Attribute Details

#all_dataObject

Returns Tempfile.

Returns:

  • Tempfile



77
78
79
# File 'lib/export/dwca/data.rb', line 77

# Plain reader for @all_data.
#
# @return [Tempfile] documented as a Tempfile; this reader only returns the stored value
def all_data
  @all_data
end

#biological_associations_extensionHash

Returns a Hash with keys core_params and collection_objects_query.

Returns:

  • (Hash)

    with keys core_params and collection_objects_query



51
52
53
# File 'lib/export/dwca/data.rb', line 51

# Plain reader for @biological_associations_extension, which the constructor
# fills from extension_scopes[:biological_associations].
#
# @return [Hash, nil] nil when that key was not supplied
def biological_associations_extension
  @biological_associations_extension
end

#collection_object_idsObject

TODO Breaks when AssertedDistribution is added



69
70
71
# File 'lib/export/dwca/data.rb', line 69

# Plain reader for @collection_object_ids.
#
# TODO: Breaks when AssertedDistribution is added
def collection_object_ids
  @collection_object_ids
end

#core_scopeObject

!params core_scope [String, ActiveRecord::Relation]

String is fully formed SQL


48
49
50
# File 'lib/export/dwca/data.rb', line 48

# Plain reader for @core_scope, the scope handed to the constructor
# (which rejects nil).
#
# @return [String, ActiveRecord::Relation] a String is expected to be fully formed SQL
def core_scope
  @core_scope
end

#dataTempfile

Returns the csv data as a tempfile.

Returns:

  • (Tempfile)

    the csv data as a tempfile



192
193
194
# File 'lib/export/dwca/data.rb', line 192

# Plain reader for @data.
#
# @return [Tempfile] documented as the csv data in a tempfile
def data
  @data
end

#data_predicate_idsObject

collection_object_predicate_id: [], collecting_event_predicate_id: []

Returns:

  • Hash



64
65
66
# File 'lib/export/dwca/data.rb', line 64

# Plain reader for @data_predicate_ids.
#
# @return [Hash] the constructor seeds it with the keys
#   :collection_object_predicate_id and :collecting_event_predicate_id
def data_predicate_ids
  @data_predicate_ids
end

#dwc_id_orderObject

Get order of ids that matches core records so we can align with csv zero! Like 2=>1, 3=>2, 4=>3, 5=>4

Returns:

  • Hash



81
82
83
# File 'lib/export/dwca/data.rb', line 81

# Plain reader for @dwc_id_order.
#
# @return [Hash] documented as the order of ids matching the core records,
#   used to align extension rows with the csv (e.g. 2=>1, 3=>2, 4=>3, 5=>4)
def dwc_id_order
  @dwc_id_order
end

#emlTempfile

This is a stub, and only half-heartedly done. You should be using IPT for the time being. See also

https://github.com/gbif/ipt/wiki/
https://github.com/gbif/ipt/wiki/#exemplar-datasets

TODO: reference biological_resource_extension.csv

Returns:

  • (Tempfile)

    metadata about this dataset



586
587
588
# File 'lib/export/dwca/data.rb', line 586

# Plain reader for @eml.
#
# @return [Tempfile] documented as the metadata about this dataset
def eml
  @eml
end

#eml_dataHash

for use in construction of the eml file.

Returns:

  • (Hash)

    containing dataset and additional_metadata, as xml strings,



38
39
40
# File 'lib/export/dwca/data.rb', line 38

# Plain reader for @eml_data, stored as given to the constructor.
#
# @return [Hash] documented as containing dataset and additional_metadata as xml strings
def eml_data
  @eml_data
end

#filenameString (readonly)

the name of zipfile

Returns:

  • (String)


765
766
767
# File 'lib/export/dwca/data.rb', line 765

# Plain reader for @filename.
#
# @return [String] documented as the name of the zipfile
def filename
  @filename
end

#media_extensionHash

Returns of collection_objects: query_string, field_occurrences: query_string.

Returns:

  • (Hash)

    of collection_objects: query_string, field_occurrences: query_string



54
55
56
# File 'lib/export/dwca/data.rb', line 54

# Plain reader for @media_extension, which the constructor fills from
# extension_scopes[:media].
#
# @return [Hash, nil] Hash with keys collection_objects and field_occurrences; nil when not supplied
def media_extension
  @media_extension
end

#metaObject

Returns the value of attribute meta.



42
43
44
# File 'lib/export/dwca/data.rb', line 42

# Plain reader for @meta.
#
# @return [Object] whatever was stored in @meta
def meta
  @meta
end

#predicate_dataObject

Returns the value of attribute predicate_data.



60
61
62
# File 'lib/export/dwca/data.rb', line 60

# Plain reader for @predicate_data.
#
# @return [Object] whatever was stored in @predicate_data
def predicate_data
  @predicate_data
end

#taxonworks_extension_dataObject

rubocop:disable Metrics/MethodLength



249
250
251
# File 'lib/export/dwca/data.rb', line 249

# Plain reader for @taxonworks_extension_data.
#
# @return [Object] whatever was stored in @taxonworks_extension_data
def taxonworks_extension_data
  @taxonworks_extension_data
end

#taxonworks_extension_methodsObject

Returns the value of attribute taxonworks_extension_methods.



74
75
76
# File 'lib/export/dwca/data.rb', line 74

# Plain reader for @taxonworks_extension_methods, which the constructor sets
# from its taxonworks_extensions argument.
#
# @return [Array<Symbol>] documented as the methods to perform on each CO
def taxonworks_extension_methods
  @taxonworks_extension_methods
end

#totalObject

TODO update



56
57
58
# File 'lib/export/dwca/data.rb', line 56

# Plain reader for @total.
#
# TODO: update
def total
  @total
end

#zipfileTempfile

Returns the zipfile.

Returns:

  • (Tempfile)

    the zipfile



756
757
758
# File 'lib/export/dwca/data.rb', line 756

# Plain reader for @zipfile.
#
# @return [Tempfile] documented as the zipfile
def zipfile
  @zipfile
end

Instance Method Details

#biological_association_relations_to_coreObject

rubocop:enable Metrics/MethodLength



605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
# File 'lib/export/dwca/data.rb', line 605

# Collects the ids of the BiologicalAssociations returned by the filter for the
# core records, split by the role the collection object plays in them.
#
# @return [Hash{Symbol => Set}] :subject and :object, each a Set of association ids
def biological_association_relations_to_core
  occurrence_params = { dwc_occurrence_query: @biological_associations_extension[:core_params] }

  # Both filters are identical except for the role passed to them.
  %i[subject object].to_h do |role|
    associations = ::Queries::BiologicalAssociation::Filter.new(
      collection_object_query: occurrence_params,
      collection_object_as_subject_or_as_object: role
    ).all

    [role, Set.new(associations.pluck(:id))]
  end
end

#biological_associations_resource_relationship_tmpObject



628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
# File 'lib/export/dwca/data.rb', line 628

# Returns the tempfile holding the resource-relationship extension rows,
# building it on first use.
#
# The file is memoized so that repeated calls (build_zip, then cleanup's
# close/unlink pair) all see the same Tempfile. Previously every call ran the
# export again into a brand-new Tempfile, so cleanup never removed the file
# that had actually been zipped.
#
# @return [Tempfile, nil] nil when no biological associations extension was requested
def biological_associations_resource_relationship_tmp
  return nil if biological_associations_extension.nil?
  return @biological_associations_resource_relationship_tmp if @biological_associations_resource_relationship_tmp

  # Produce the content before creating the file so a failing export does not
  # leave an orphaned tempfile behind.
  content =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::BiologicalAssociations.csv(biological_associations_extension, biological_association_relations_to_core)
    end

  tmp = Tempfile.new('biological_resource_relationship.xml')
  tmp.write(content)
  tmp.flush
  tmp.rewind

  @biological_associations_resource_relationship_tmp = tmp
end

#build_zipObject



737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
# File 'lib/export/dwca/data.rb', line 737

# Assembles the archive: the core data file, any optional extension files,
# then meta.xml and eml.xml.
#
# Optional entries are added only when their builder actually returned a file.
# media_tmp returns nil for an empty media_extension hash, which is truthy, so
# guarding on media_extension alone raised NoMethodError on nil.
#
# @return [Tempfile] the tempfile that now holds the zip archive
def build_zip
  t = Tempfile.new(filename)

  # NOTE(review): presumably this initialises the tempfile as an empty zip
  # archive so Zip::File can open it by path below — confirm against rubyzip.
  Zip::OutputStream.open(t) { |_zos| }

  Zip::File.open(t.path, Zip::File::CREATE) do |zip|
    zip.add('data.tsv', all_data.path)

    media_file = media_tmp if media_extension
    zip.add('media.tsv', media_file.path) if media_file

    relationships_file = biological_associations_resource_relationship_tmp if biological_associations_extension
    zip.add('resource_relationships.tsv', relationships_file.path) if relationships_file

    zip.add('meta.xml', meta.path)
    zip.add('eml.xml', eml.path)
  end
  t
end

#cleanupTrue

Closes and deletes all temporary files.

Returns:

  • (True)

    always true, once all temporary files have been closed and deleted



772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
# File 'lib/export/dwca/data.rb', line 772

# Closes and deletes the temporary files used by the export.
#
# @return [true]
def cleanup
  Rails.logger.debug 'dwca_export: cleanup start'

  zipfile.close
  zipfile.unlink
  meta.close
  meta.unlink
  eml.close
  eml.unlink
  data.close
  data.unlink

  # The extension tempfiles are taken from their instance variables rather
  # than through the builder methods, so cleanup only touches files that were
  # really created: it never triggers a fresh export just to delete it, and it
  # skips extensions that were requested but never produced a file.
  [@biological_associations_resource_relationship_tmp, @media_tmp].compact.each do |tmp|
    tmp.close
    tmp.unlink
  end

  if predicate_options_present?
    predicate_data.close
    predicate_data.unlink
  end

  if taxonworks_options_present?
    taxonworks_extension_data.close
    taxonworks_extension_data.unlink
  end

  all_data.close
  all_data.unlink

  Rails.logger.debug 'dwca_export: cleanup end'

  true
end

#collecting_event_attributesObject

@return Array

1 row per CO per DA (type) on CE


432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# File 'lib/export/dwca/data.rb', line 432

# Fetches the internal data attributes attached to the collecting events of
# the scoped collection objects: 1 row per CO per DA (type) on CE.
#
# When collecting_event_predicate_ids is non-empty, only those predicates are
# returned. The ids reach this class through the constructor's
# predicate_extensions argument, so each one is quoted by the connection
# before being placed in the statement instead of being interpolated raw.
#
# @return [Array<Array>] rows of [collection_object_id, predicate_name, value]
def collecting_event_attributes
  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql}
  )

  SELECT
      relevant_collection_objects.id AS co_id,
      CONCAT('TW:DataAttribute:CollectingEvent:', cvt.name) AS predicate,
      da.value
  FROM
      data_attributes da
      JOIN collecting_events ce ON ce.id = da.attribute_subject_id
           AND da.attribute_subject_type = 'CollectingEvent'
           AND da.type = 'InternalAttribute'
      LEFT JOIN relevant_collection_objects ON ce.id = relevant_collection_objects.collecting_event_id
      JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
          AND cvt.type = 'Predicate'
  WHERE relevant_collection_objects.id IS NOT null"

  connection = DataAttribute.connection

  if collecting_event_predicate_ids.any?
    quoted_ids = collecting_event_predicate_ids.map { |id| connection.quote(id) }.join(',')
    q += " AND da.controlled_vocabulary_term_id IN (#{quoted_ids})"
  end

  connection.execute(q).map { |r| [r['co_id'], r['predicate'], r['value']] }
end

#collecting_event_attributes_queryObject

Returns Relation the unique attributes derived from CollectingEvents.

Returns:

  • Relation the unique attributes derived from CollectingEvents



421
422
423
424
425
426
427
428
# File 'lib/export/dwca/data.rb', line 421

# Builds a relation over the InternalAttributes attached to the collecting
# events returned by #collecting_events, limited to
# collecting_event_predicate_ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation selecting from the generated subquery
def collecting_event_attributes_query
  scoped_attributes = ::InternalAttribute
    .joins("JOIN touched_collecting_events as tce1 on data_attributes.attribute_subject_id = tce1.id AND data_attributes.attribute_subject_type = 'CollectingEvent'")
    .where(controlled_vocabulary_term_id: collecting_event_predicate_ids)

  # The collecting events become a CTE that the join above refers to by name.
  cte_sql = "WITH touched_collecting_events AS (#{collecting_events.to_sql}) #{scoped_attributes.to_sql}"

  ::InternalAttribute.from("(#{cte_sql}) as data_attributes")
end

#collecting_event_predicate_idsObject



117
118
119
# File 'lib/export/dwca/data.rb', line 117

# Looks up the :collecting_event_predicate_id entry of @data_predicate_ids.
#
# @return [Array] the constructor defaults this entry to an empty Array
def collecting_event_predicate_ids
  @data_predicate_ids[:collecting_event_predicate_id]
end

#collecting_eventsObject

rubocop:enable Metrics/MethodLength



379
380
381
382
383
384
385
386
# File 'lib/export/dwca/data.rb', line 379

# Builds a relation over the distinct CollectingEvents referenced by the
# scoped collection objects.
#
# @return [ActiveRecord::Relation] CollectingEvent relation selecting from the generated subquery
def collecting_events
  scoped_objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql

  events_sql = ::CollectingEvent
    .joins('JOIN co_scoped as co_scoped1 on co_scoped1.collecting_event_id = collecting_events.id')
    .distinct
    .to_sql

  # The scoped collection objects become a CTE that the join above refers to by name.
  ::CollectingEvent.from("(WITH co_scoped AS (#{scoped_objects_sql}) #{events_sql}) as collecting_events")
end

#collection_object_attributesObject



399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/export/dwca/data.rb', line 399

# Fetches the internal data attributes attached directly to the scoped
# collection objects.
#
# When collection_object_predicate_ids is non-empty, only those predicates are
# returned. The ids reach this class through the constructor's
# predicate_extensions argument, so each one is quoted by the connection
# before being placed in the statement instead of being interpolated raw.
#
# @return [Array<Array>] rows of [collection_object_id, predicate_name, value]
def collection_object_attributes
  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id).to_sql}
  )
  SELECT da.id, da.attribute_subject_id,
         CONCAT('TW:DataAttribute:CollectionObject:', cvt.name) AS predicate,
         da.value,
         da.controlled_vocabulary_term_id
  FROM data_attributes da
  JOIN relevant_collection_objects rco ON da.attribute_subject_id = rco.id
                                       AND da.attribute_subject_type = 'CollectionObject'
  JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
                                       AND cvt.type = 'Predicate'
  WHERE da.type = 'InternalAttribute'"

  connection = DataAttribute.connection

  if collection_object_predicate_ids.any?
    quoted_ids = collection_object_predicate_ids.map { |id| connection.quote(id) }.join(',')
    q += " AND da.controlled_vocabulary_term_id IN (#{quoted_ids})"
  end

  connection.execute(q).map { |r| [r['attribute_subject_id'], r['predicate'], r['value']] }
end

#collection_object_attributes_queryObject



388
389
390
391
392
393
394
395
396
397
# File 'lib/export/dwca/data.rb', line 388

# Builds a relation over the InternalAttributes attached to the scoped
# collection objects, joined to their predicate and limited to
# collection_object_predicate_ids.
#
# @return [ActiveRecord::Relation] InternalAttribute relation selecting from the generated subquery
def collection_object_attributes_query
  touched_sql = collection_objects.unscope(:order).select(:id).to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collection_objects as tco1 on data_attributes.attribute_subject_id = tco1.id AND data_attributes.attribute_subject_type = 'CollectionObject'")
    .to_sql

  # The scoped collection objects become a CTE that the join above refers to by name.
  ::InternalAttribute
    .joins(:predicate)
    .where(controlled_vocabulary_term_id: collection_object_predicate_ids)
    .from("(WITH touched_collection_objects AS (#{touched_sql}) #{attributes_sql}) as data_attributes")
end

#collection_object_predicate_idsObject



113
114
115
# File 'lib/export/dwca/data.rb', line 113

# Looks up the :collection_object_predicate_id entry of @data_predicate_ids.
#
# @return [Array] the constructor defaults this entry to an empty Array
def collection_object_predicate_ids
  @data_predicate_ids[:collection_object_predicate_id]
end

#collection_objectsObject



456
457
458
459
460
461
462
463
# File 'lib/export/dwca/data.rb', line 456

# Builds a relation over the CollectionObjects referenced by the core scope's
# DwcOccurrence rows (only rows whose object type is 'CollectionObject').
#
# Calls unscope/select/to_sql on core_scope, so it needs a relation-like
# object rather than a raw SQL String.
#
# @return [ActiveRecord::Relation] CollectionObject relation selecting id, collecting_event_id and type
def collection_objects
  occurrence_sql = core_scope
    .unscope(:order)
    .select('dwc_occurrences.dwc_occurrence_object_id, dwc_occurrences.dwc_occurrence_object_type')
    .to_sql

  objects_sql = ::CollectionObject
    .joins("JOIN dwc_scoped as dwc_scoped1 on dwc_scoped1.dwc_occurrence_object_id = collection_objects.id and dwc_scoped1.dwc_occurrence_object_type = 'CollectionObject'")
    .select(:id, :collecting_event_id, :type)
    .to_sql

  # The core scope becomes a CTE that the join above refers to by name.
  ::CollectionObject.from("(WITH dwc_scoped AS (#{occurrence_sql}) #{objects_sql}) as collection_objects")
end

#csvCSV

Returns the data as a CSV object.

Returns:

  • (CSV)

    the data as a CSV object



171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/export/dwca/data.rb', line 171

# Delegates to ::Export::CSV.generate_csv with the core scope's computed
# columns and the DwC-specific formatting options.
#
# @return [Object] whatever ::Export::CSV.generate_csv returns (documented as the CSV data)
def csv
  columns = core_scope.computed_columns

  options = {
    # TODO: check to see if we need this
    exclude_columns: ::DwcOccurrence.excluded_columns,
    # TODO: add other maps here
    column_order: ::CollectionObject::DWC_OCCURRENCE_MAP.keys + ::CollectionObject::EXTENSION_FIELDS,
    trim_columns: true, # going to have to be optional
    trim_rows: false,
    header_converters: [:dwc_headers],
    copy_column: { from: 'occurrenceID', to: 'id' }
  }

  ::Export::CSV.generate_csv(columns, **options)
end

#extension_computed_fields_data(methods) ⇒ Object

TODO: return, or optimize to this when ::CollectionObject::EXTENSION_COMPUTED_FIELDS.size > 1 def extension_computed_fields_data(methods)

d = []
collection_objects.find_each do |object|
  methods.each_pair { |method, name| d  << [object.id, name, object.send(method)] }
end
d

end

!! This will have to be reverted to above when > 1 EXTENSION field is present



232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/export/dwca/data.rb', line 232

# Produces the 'TW:Internal:otu_name' extension rows for the scoped collection
# objects, limited to the taxon determination in position '1'.
#
# !! Only the otu_name field is supported: `methods` is consulted solely to
# decide whether there is anything to do.
#
# @param methods [#empty?] requested extension methods; empty means no rows
# @return [Array<Array>] rows of [collection_object_id, 'TW:Internal:otu_name', otu_name_or_nil]
def extension_computed_fields_data(methods)
  return [] if methods.empty?

  predicate = 'TW:Internal:otu_name'.freeze

  # n = "COALESCE( otus.name, TRIM(CONCAT(cached, ' ', cached_author_year))) as otu_name"

  rows = collection_objects
    .left_joins(otu: [:taxon_name])
    .select("collection_objects.id, otus.name as otu_name")
    .where(taxon_determinations: { position: '1' })
    .find_each(batch_size: 10000)

  # Blank OTU names are reported as nil.
  rows.map { |row| [row.id, predicate, row['otu_name'].presence] }
end

#media_tmpObject



646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
# File 'lib/export/dwca/data.rb', line 646

# Returns the tempfile holding the media extension rows, building it on first
# use.
#
# The file is memoized so that repeated calls (build_zip, then cleanup's
# close/unlink pair) all see the same Tempfile. Previously every call ran the
# export again into a brand-new Tempfile, so cleanup never removed the file
# that had actually been zipped.
#
# @return [Tempfile, nil] nil when media_extension is nil or empty
def media_tmp
  return nil if media_extension.nil? || media_extension.empty?
  return @media_tmp if @media_tmp

  # Produce the content before creating the file so a failing export does not
  # leave an orphaned tempfile behind.
  content =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::Media.csv(media_extension[:collection_objects], media_extension[:field_occurrences])
    end

  tmp = Tempfile.new('media.xml')
  tmp.write(content)
  tmp.flush
  tmp.rewind

  @media_tmp = tmp
end

#meta_fieldsArray

Non-standard DwC columns are handled elsewhere.

Returns:

  • (Array)

    use the temporarily written, and refined, CSV file to read off the existing headers so we can use them in writing meta.xml



667
668
669
670
671
672
# File 'lib/export/dwca/data.rb', line 667

# Reads the header row of the combined data file and returns its column names
# without the leading id column (that one is specified by hand).
#
# @return [Array<String>] tab-separated header names after the first; empty
#   when there are no records or the file has no first line
def meta_fields
  return [] if no_records?

  header_line = File.open(all_data, &:gets)
  return [] if header_line.nil?

  header_line.strip.split("\t").drop(1)
end

#no_records?Boolean

Returns true if provided core_scope returns no records.

Returns:

  • (Boolean)

    true if provided core_scope returns no records



186
187
188
# File 'lib/export/dwca/data.rb', line 186

# Tells whether the export has nothing to write.
#
# @return [Boolean] true when #total equals 0 (documented as: the provided
#   core_scope returns no records)
def no_records?
  total == 0
end

#package_download(download) ⇒ Object

Parameters:



814
815
816
817
818
819
# File 'lib/export/dwca/data.rb', line 814

# Hands the built zipfile's path to the given download record.
#
# @param download [#update!] receives source_file_path set to the zipfile's path
# @return [Object] whatever download.update! returns
def package_download(download)
  # This doesn't touch the db (source_file_path is an instance var).
  download.update!(source_file_path: zipfile.path)
end

#predicate_options_present?Boolean

Returns:

  • (Boolean)


157
158
159
# File 'lib/export/dwca/data.rb', line 157

# Tells whether any predicate ids were requested for either collection objects
# or collecting events.
#
# @return [Boolean] true when at least one of the two id lists is present
def predicate_options_present?
  %i[collection_object_predicate_id collecting_event_predicate_id].any? do |key|
    data_predicate_ids[key].present?
  end
end

#taxonworks_options_present?Boolean

Returns:

  • (Boolean)


161
162
163
# File 'lib/export/dwca/data.rb', line 161

# Tells whether any TaxonWorks extension methods were requested.
#
# @return [Boolean] result of `present?` on taxonworks_extension_methods
def taxonworks_options_present?
  taxonworks_extension_methods.present?
end

#used_collecting_event_predicatesObject



471
472
473
474
475
# File 'lib/export/dwca/data.rb', line 471

# Lists the distinct predicate names found on the collecting event attributes,
# each prefixed with 'TW:DataAttribute:CollectingEvent:'.
#
# @return [Array<String>] one prefixed name per distinct row
def used_collecting_event_predicates
  named_predicates = collecting_event_attributes_query
    .joins(:predicate)
    .select("CONCAT('TW:DataAttribute:CollectingEvent:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named_predicates.map { |row| row['predicate_name'] }
end

#used_collection_object_predicatesObject



465
466
467
468
469
# File 'lib/export/dwca/data.rb', line 465

# Lists the distinct predicate names found on the collection object
# attributes, each prefixed with 'TW:DataAttribute:CollectionObject:'.
#
# @return [Array<String>] one prefixed name per distinct row
def used_collection_object_predicates
  named_predicates = collection_object_attributes_query
    .select("CONCAT('TW:DataAttribute:CollectionObject:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named_predicates.map { |row| row['predicate_name'] }
end

#used_predicatesArray

Returns of distinct Predicate names in the format

`TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`.

Returns:

  • (Array)

    of distinct Predicate names in the format

    `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
    


480
481
482
# File 'lib/export/dwca/data.rb', line 480

# Concatenates the collection object predicate names with the collecting event
# predicate names, in that order.
#
# @return [Array] names in the format
#   `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
def used_predicates
  used_collection_object_predicates + used_collecting_event_predicates
end