Class: Export::Dwca::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/data.rb

Overview

!! This export does not support AssertedDistribution data at the moment. While those data are indexed, if they are in the `core_scope` they will almost certainly cause problems or be ignored. !!

Wrapper to build DwC-A zipfiles for a specific project. See tasks/accessions/report/dwc_controller.rb for use.

With help from thinkingeek.com/2013/11/15/create-temporary-zip-file-send-response-rails/

Usage:

begin
 data = Dwca::Data.new(core_scope: DwcOccurrence.where(project_id: sessions_current_project_id))
ensure
 data.cleanup
end

Always use the ensure/data.cleanup pattern!

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, taxonworks_extensions: []) ⇒ Data

Returns a new instance of Data.

Parameters:

  • taxonworks_extensions (Array<Symbol>) (defaults to: [])

    List of methods to perform on each CO

Raises:

  • (ArgumentError)


82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/export/dwca/data.rb', line 82

# Set up an export for the records selected by `core_scope`.
#
# @param core_scope [Object] required; stored untouched
# @param extension_scopes [Hash] read by key: `:biological_associations` and `:media`
# @param predicate_extensions [Hash] merged over the default (empty) predicate id lists
# @param taxonworks_extensions [Array<Symbol>] list of methods to perform on each CO
# @raise [ArgumentError] when `core_scope` is nil
def initialize(core_scope: nil, extension_scopes: {}, predicate_extensions: {}, taxonworks_extensions: [])
  raise ArgumentError, 'must pass a core_scope' if core_scope.nil?

  @core_scope = core_scope

  # Hash with keys core_params, collection_objects_query
  biological_associations = extension_scopes[:biological_associations]
  # Hash with keys collection_objects, field_occurrences
  media = extension_scopes[:media]
  @biological_associations_extension = biological_associations
  @media_extension = media

  # Both keys always exist; values supplied by the caller win over the empty defaults.
  default_predicate_ids = { collection_object_predicate_id: [], collecting_event_predicate_id: [] }
  @data_predicate_ids = default_predicate_ids.merge(predicate_extensions)

  @taxonworks_extension_methods = taxonworks_extensions
end

Instance Attribute Details

#all_dataObject

Returns Tempfile.

Returns:

  • Tempfile



75
76
77
# File 'lib/export/dwca/data.rb', line 75

# Reader for @all_data. #build_zip adds this file's path to the archive as
# 'data.tsv', and #cleanup closes and unlinks it.
#
# @return [Tempfile]
def all_data
  @all_data
end

#biological_associations_extensionScope?

Returning BiologicalAssociation

Returns:

  • (Scope, nil)

    Returning BiologicalAssociation



48
49
50
# File 'lib/export/dwca/data.rb', line 48

# Reader for the value #initialize takes from
# `extension_scopes[:biological_associations]`; nil when the caller gave none.
#
# NOTE(review): #initialize describes this as a Hash with keys core_params and
# collection_objects_query, while the attribute docs describe a Scope returning
# BiologicalAssociation -- confirm which is current.
#
# @return [Object, nil]
def biological_associations_extension
  @biological_associations_extension
end

#collection_object_idsObject

TODO Breaks when AssertedDistribution is added



67
68
69
# File 'lib/export/dwca/data.rb', line 67

# Reader for @collection_object_ids.
#
# TODO: Breaks when AssertedDistribution is added
def collection_object_ids
  @collection_object_ids
end

#core_scopeObject

!params core_scope [String, ActiveRecord::Relation]

String is fully formed SQL


44
45
46
# File 'lib/export/dwca/data.rb', line 44

# Reader for the required `core_scope` passed to #initialize.
#
# Documented as [String, ActiveRecord::Relation], where a String is fully formed SQL.
# NOTE(review): #collection_objects calls `unscope` on this value and #csv calls
# `computed_columns`, neither of which a String provides -- confirm whether String
# input is still supported.
def core_scope
  @core_scope
end

#dataTempfile

Returns the csv data as a tempfile.

Returns:

  • (Tempfile)

    the csv data as a tempfile



187
188
189
# File 'lib/export/dwca/data.rb', line 187

# Reader for @data; #cleanup closes and unlinks it.
#
# @return [Tempfile] the csv data as a tempfile
def data
  @data
end

#data_predicate_idsObject

collection_object_predicate_id: [], collecting_event_predicate_id: []

Returns:

  • Hash



62
63
64
# File 'lib/export/dwca/data.rb', line 62

# Reader for the predicate id lists assembled in #initialize.
#
# @return [Hash] keys `:collection_object_predicate_id` and
#   `:collecting_event_predicate_id`, each defaulting to `[]`
def data_predicate_ids
  @data_predicate_ids
end

#dwc_id_orderObject

Get order of ids that matches core records so we can align with csv zero! Like 2=>1, 3=>2, 4=>3, 5=>4

Returns:

  • Hash



79
80
81
# File 'lib/export/dwca/data.rb', line 79

# Reader for @dwc_id_order: the order of ids that matches the core records,
# used to align extension data with the csv rows (e.g. 2=>1, 3=>2, 4=>3, 5=>4).
#
# @return [Hash]
def dwc_id_order
  @dwc_id_order
end

#emlTempfile

This is a stub, and only half-heartedly done. You should be using IPT for the time being. See also

https://github.com/gbif/ipt/wiki/
https://github.com/gbif/ipt/wiki/#exemplar-datasets

TODO: reference biological_resource_extension.csv

Returns:

  • (Tempfile)

    metadata about this dataset



569
570
571
# File 'lib/export/dwca/data.rb', line 569

# Reader for @eml. #build_zip adds this file's path to the archive as 'eml.xml',
# and #cleanup closes and unlinks it.
#
# This is a stub, and only half-heartedly done. You should be using IPT for the
# time being.
#
# TODO: reference biological_resource_extension.csv
#
# @return [Tempfile] metadata about this dataset
def eml
  @eml
end

#filenameString (readonly)

the name of zipfile

Returns:

  • (String)


819
820
821
# File 'lib/export/dwca/data.rb', line 819

# Reader for @filename; #build_zip passes it to Tempfile.new as the basename.
#
# @return [String] the name of the zipfile
def filename
  @filename
end

#media_extensionScope?

Returns Image(?).

Returns:

  • (Scope, nil)

    Image(?)



52
53
54
# File 'lib/export/dwca/data.rb', line 52

# Reader for the value #initialize takes from `extension_scopes[:media]`;
# nil when the caller gave none.
#
# #initialize describes it as a Hash with keys collection_objects and
# field_occurrences, which is how #media_resource_relationship reads it.
#
# @return [Hash, nil]
def media_extension
  @media_extension
end

#metaObject

Returns the value of attribute meta.



38
39
40
# File 'lib/export/dwca/data.rb', line 38

# Reader for @meta. #build_zip adds this file's path to the archive as 'meta.xml',
# and #cleanup closes and unlinks it.
def meta
  @meta
end

#predicate_dataObject

Returns the value of attribute predicate_data.



58
59
60
# File 'lib/export/dwca/data.rb', line 58

# Reader for @predicate_data; #cleanup closes and unlinks it when
# #predicate_options_present? is true.
def predicate_data
  @predicate_data
end

#taxonworks_extension_dataObject

rubocop:disable Metrics/MethodLength



244
245
246
# File 'lib/export/dwca/data.rb', line 244

# Reader for @taxonworks_extension_data; #cleanup closes and unlinks it when
# #taxonworks_options_present? is true.
def taxonworks_extension_data
  @taxonworks_extension_data
end

#taxonworks_extension_methodsObject

Returns the value of attribute taxonworks_extension_methods.



72
73
74
# File 'lib/export/dwca/data.rb', line 72

# Reader for the `taxonworks_extensions` list passed to #initialize
# (documented there as the methods to perform on each CO).
#
# @return [Array<Symbol>]
def taxonworks_extension_methods
  @taxonworks_extension_methods
end

#totalObject

TODO update



54
55
56
# File 'lib/export/dwca/data.rb', line 54

# Reader for @total; #no_records? compares it with 0.
#
# TODO: update
def total
  @total
end

#zipfileTempfile

Returns the zipfile.

Returns:

  • (Tempfile)

    the zipfile



810
811
812
# File 'lib/export/dwca/data.rb', line 810

# Reader for @zipfile. #package_download reads its path, and #cleanup closes and
# unlinks it.
#
# @return [Tempfile] the zipfile
def zipfile
  @zipfile
end

Instance Method Details

#biological_association_relations_to_coreObject



659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
# File 'lib/export/dwca/data.rb', line 659

# Collect the ids of the BiologicalAssociations that the BiologicalAssociation
# filter returns for the core query, split by the side the collection object is on.
#
# @return [Hash{Symbol => Set}] `:subject` and `:object`, each a Set of
#   BiologicalAssociation ids
def biological_association_relations_to_core
  dwc_query = { dwc_occurrence_query: @biological_associations_extension[:core_params] }

  # Build both filters before plucking anything, one per side of the association.
  subject_scope, object_scope = %i[subject object].map do |side|
    ::Queries::BiologicalAssociation::Filter.new(
      collection_object_query: dwc_query,
      collection_object_as_subject_or_as_object: side
    ).all
  end

  subject_ids = Set.new(subject_scope.pluck(:id))
  object_ids = Set.new(object_scope.pluck(:id))

  { subject: subject_ids, object: object_ids }
end

#biological_associations_resource_relationshipObject



682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
# File 'lib/export/dwca/data.rb', line 682

# Write the biological associations extension to a new Tempfile.
#
# Every call creates a fresh Tempfile and stores it in
# @biological_associations_resource_relationship.
#
# @return [Tempfile, nil] the rewound file, or nil when no biological
#   associations extension was provided
def biological_associations_resource_relationship
  return nil if biological_associations_extension.nil?

  file = Tempfile.new('biological_resource_relationship.xml')
  @biological_associations_resource_relationship = file

  # With no core records the file holds only a newline.
  body =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::BiologicalAssociations.csv(
        biological_associations_extension,
        biological_association_relations_to_core
      )
    end

  file.write(body)
  file.flush
  file.rewind
  file
end

#build_zipObject



791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
# File 'lib/export/dwca/data.rb', line 791

# Assemble the archive in a new Tempfile named after #filename.
#
# Entries: 'data.tsv', 'meta.xml', 'eml.xml', plus 'media.tsv' and
# 'resource_relationships.tsv' when the matching extension produced a file.
#
# @return [Tempfile] the zip archive
def build_zip
  t = Tempfile.new(filename)

  # Write an empty archive into the tempfile before reopening it by path.
  Zip::OutputStream.open(t) { |zos| }

  Zip::File.open(t.path, Zip::File::CREATE) do |zip|
    zip.add('data.tsv', all_data.path)

    # The relationship builders return nil when there is nothing to export
    # (#media_resource_relationship does so for an empty extension too), so guard
    # on the file that was built instead of on the extension alone.
    media_file = media_resource_relationship if media_extension
    zip.add('media.tsv', media_file.path) if media_file

    relationships_file = biological_associations_resource_relationship if biological_associations_extension
    zip.add('resource_relationships.tsv', relationships_file.path) if relationships_file

    zip.add('meta.xml', meta.path)
    zip.add('eml.xml', eml.path)
  end
  t
end

#cleanupTrue

Closes and deletes all temporary files; returns true.

Returns:

  • (True)

    close and delete all temporary files



826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
# File 'lib/export/dwca/data.rb', line 826

# Close and delete all temporary files.
#
# @return [true]
def cleanup
  Rails.logger.debug 'dwca_export: cleanup start'

  zipfile.close
  zipfile.unlink
  meta.close
  meta.unlink
  eml.close
  eml.unlink
  data.close
  data.unlink

  # Release the extension files that were actually written. The builder methods
  # create a brand new Tempfile (and re-run the export) on every call, so calling
  # them here would leave the original files behind; the instance variables they
  # assign are used instead. Either is nil when that extension was never built.
  [@biological_associations_resource_relationship, @media_resource_relationship].compact.each do |file|
    file.close
    file.unlink
  end

  if predicate_options_present?
    predicate_data.close
    predicate_data.unlink
  end

  if taxonworks_options_present?
    taxonworks_extension_data.close
    taxonworks_extension_data.unlink
  end

  all_data.close
  all_data.unlink

  Rails.logger.debug 'dwca_export: cleanup end'

  true
end

#collecting_event_attributesObject

@return Array

1 row per CO per DA (type) on CE


413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
# File 'lib/export/dwca/data.rb', line 413

# InternalAttribute values recorded on the CollectingEvents of the exported
# CollectionObjects: 1 row per CO per DA (type) on CE.
#
# When collecting event predicate ids are configured, only those predicates are
# returned; otherwise no predicate filter is applied.
#
# @return [Array<Array>] rows of `[collection_object_id, predicate, value]`
# @raise [ArgumentError] when a configured predicate id is not a base-10 integer
def collecting_event_attributes
  # The ids are spliced into raw SQL below, so force each one to a base-10 integer
  # first; anything else is rejected before a query is built. nil entries are dropped.
  predicate_ids = collecting_event_predicate_ids.compact.map { |id| Integer(id.to_s, 10) }

  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql}
  )

  SELECT
      relevant_collection_objects.id AS co_id,
      CONCAT('TW:DataAttribute:CollectingEvent:', cvt.name) AS predicate,
      da.value
  FROM
      data_attributes da
      JOIN collecting_events ce ON ce.id = da.attribute_subject_id
           AND da.attribute_subject_type = 'CollectingEvent'
           AND da.type = 'InternalAttribute'
      LEFT JOIN relevant_collection_objects ON ce.id = relevant_collection_objects.collecting_event_id
      JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
          AND cvt.type = 'Predicate'
  WHERE relevant_collection_objects.id IS NOT null"

  q += " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" unless predicate_ids.empty?

  DataAttribute.connection.execute(q).map { |r| [r['co_id'], r['predicate'], r['value']] }
end

#collecting_event_attributes_queryObject

Returns Relation the unique attributes derived from CollectingEvents.

Returns:

  • Relation the unique attributes derived from CollectingEvents



402
403
404
405
406
407
408
409
# File 'lib/export/dwca/data.rb', line 402

# Build a relation over the InternalAttributes attached to the CollectingEvents
# returned by #collecting_events, limited to the configured collecting event
# predicate ids.
#
# @return [Relation] the unique attributes derived from CollectingEvents
def collecting_event_attributes_query
  events_sql = collecting_events.to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collecting_events as tce1 on data_attributes.attribute_subject_id = tce1.id AND data_attributes.attribute_subject_type = 'CollectingEvent'")
    .where(controlled_vocabulary_term_id: collecting_event_predicate_ids)
    .to_sql

  # The CTE-backed query is passed to `from` aliased as data_attributes.
  ::InternalAttribute.from("(WITH touched_collecting_events AS (#{events_sql}) #{attributes_sql}) as data_attributes")
end

#collecting_event_predicate_idsObject



113
114
115
# File 'lib/export/dwca/data.rb', line 113

# The `:collecting_event_predicate_id` entry of the predicate id Hash built in
# #initialize (defaults to `[]`).
def collecting_event_predicate_ids
  @data_predicate_ids[:collecting_event_predicate_id]
end

#collecting_eventsObject

rubocop:enable Metrics/MethodLength



360
361
362
363
364
365
366
367
# File 'lib/export/dwca/data.rb', line 360

# Build a CollectingEvent relation holding the distinct collecting events joined
# to the collection objects returned by #collection_objects.
def collecting_events
  objects_sql = collection_objects.unscope(:order).select(:id, :collecting_event_id).to_sql

  events_sql = ::CollectingEvent
    .joins('JOIN co_scoped as co_scoped1 on co_scoped1.collecting_event_id = collecting_events.id')
    .distinct
    .to_sql

  # The CTE-backed query is passed to `from` aliased as collecting_events.
  ::CollectingEvent.from("(WITH co_scoped AS (#{objects_sql}) #{events_sql}) as collecting_events")
end

#collection_object_attributesObject



380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# File 'lib/export/dwca/data.rb', line 380

# InternalAttribute values recorded directly on the exported CollectionObjects.
#
# When collection object predicate ids are configured, only those predicates are
# returned; otherwise no predicate filter is applied.
#
# @return [Array<Array>] rows of `[collection_object_id, predicate, value]`
# @raise [ArgumentError] when a configured predicate id is not a base-10 integer
def collection_object_attributes
  # The ids are spliced into raw SQL below, so force each one to a base-10 integer
  # first; anything else is rejected before a query is built. nil entries are dropped.
  predicate_ids = collection_object_predicate_ids.compact.map { |id| Integer(id.to_s, 10) }

  q = "WITH relevant_collection_objects AS (
      #{collection_objects.unscope(:order).select(:id).to_sql}
  )
  SELECT da.id, da.attribute_subject_id,
         CONCAT('TW:DataAttribute:CollectionObject:', cvt.name) AS predicate,
         da.value,
         da.controlled_vocabulary_term_id
  FROM data_attributes da
  JOIN relevant_collection_objects rco ON da.attribute_subject_id = rco.id
                                       AND da.attribute_subject_type = 'CollectionObject'
  JOIN controlled_vocabulary_terms cvt ON cvt.id = da.controlled_vocabulary_term_id
                                       AND cvt.type = 'Predicate'
  WHERE da.type = 'InternalAttribute'"

  q += " AND da.controlled_vocabulary_term_id IN (#{predicate_ids.join(',')})" unless predicate_ids.empty?

  DataAttribute.connection.execute(q).map { |r| [r['attribute_subject_id'], r['predicate'], r['value']] }
end

#collection_object_attributes_queryObject



369
370
371
372
373
374
375
376
377
378
# File 'lib/export/dwca/data.rb', line 369

# Build a relation over the InternalAttributes attached to the collection objects
# returned by #collection_objects, joined to their predicate and limited to the
# configured collection object predicate ids.
def collection_object_attributes_query
  objects_sql = collection_objects.unscope(:order).select(:id).to_sql

  attributes_sql = ::InternalAttribute
    .joins("JOIN touched_collection_objects as tco1 on data_attributes.attribute_subject_id = tco1.id AND data_attributes.attribute_subject_type = 'CollectionObject'")
    .to_sql

  source = "(WITH touched_collection_objects AS (#{objects_sql}) #{attributes_sql}) as data_attributes"

  ::InternalAttribute
    .joins(:predicate)
    .where(controlled_vocabulary_term_id: collection_object_predicate_ids)
    .from(source)
end

#collection_object_predicate_idsObject



109
110
111
# File 'lib/export/dwca/data.rb', line 109

# The `:collection_object_predicate_id` entry of the predicate id Hash built in
# #initialize (defaults to `[]`).
def collection_object_predicate_ids
  @data_predicate_ids[:collection_object_predicate_id]
end

#collection_objectsObject



437
438
439
440
441
442
443
444
# File 'lib/export/dwca/data.rb', line 437

# Build a CollectionObject relation (id, collecting_event_id, type) for the
# collection objects referenced by the DwcOccurrence records in #core_scope.
def collection_objects
  occurrences_sql = core_scope
    .unscope(:order)
    .select('dwc_occurrences.dwc_occurrence_object_id, dwc_occurrences.dwc_occurrence_object_type')
    .to_sql

  objects_sql = ::CollectionObject
    .joins("JOIN dwc_scoped as dwc_scoped1 on dwc_scoped1.dwc_occurrence_object_id = collection_objects.id and dwc_scoped1.dwc_occurrence_object_type = 'CollectionObject'")
    .select(:id, :collecting_event_id, :type)
    .to_sql

  # The CTE-backed query is passed to `from` aliased as collection_objects.
  ::CollectionObject.from("(WITH dwc_scoped AS (#{occurrences_sql}) #{objects_sql}) as collection_objects")
end

#csvCSV

Returns the data as a CSV object.

Returns:

  • (CSV)

    the data as a CSV object



167
168
169
170
171
172
173
174
175
176
177
# File 'lib/export/dwca/data.rb', line 167

# Generate the core data through Export::CSV.generate_csv.
#
# @return [CSV] the data as a CSV object
def csv
  columns = core_scope.computed_columns

  options = {
    # TODO: check to see if we need this
    exclude_columns: ::DwcOccurrence.excluded_columns,
    # TODO: add other maps here
    column_order: ::CollectionObject::DWC_OCCURRENCE_MAP.keys + ::CollectionObject::EXTENSION_FIELDS,
    trim_columns: true, # going to have to be optional
    trim_rows: false,
    header_converters: [:dwc_headers]
  }

  ::Export::CSV.generate_csv(columns, **options)
end

#extension_computed_fields_data(methods) ⇒ Object

TODO: return to, or optimize toward, this implementation when ::CollectionObject::EXTENSION_COMPUTED_FIELDS.size > 1:

def extension_computed_fields_data(methods)

d = []
collection_objects.find_each do |object|
  methods.each_pair { |method, name| d  << [object.id, name, object.send(method)] }
end
d

end

!! This will have to be reverted to above when > 1 EXTENSION field is present



227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/export/dwca/data.rb', line 227

# Rows for the single computed extension field currently supported, the OTU name.
#
# !! This will have to be reverted to a per-method implementation when > 1
# EXTENSION field is present: `methods` is only checked for emptiness here.
#
# @param methods [#empty?] requested extension methods
# @return [Array<Array>] rows of `[collection_object_id, 'TW:Internal:otu_name', otu_name or nil]`
def extension_computed_fields_data(methods)
  return [] if methods.empty?

  predicate = "TW:Internal:otu_name".freeze

  # n = "COALESCE( otus.name, TRIM(CONCAT(cached, ' ', cached_author_year))) as otu_name"

  records = collection_objects
    .left_joins(otu: [:taxon_name])
    .select("collection_objects.id, otus.name as otu_name")
    .where(taxon_determinations: {position: '1'})
    .find_each(batch_size: 10000)

  records.map { |record| [record.id, predicate, record['otu_name'].presence] }
end

#media_resource_relationshipObject



700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
# File 'lib/export/dwca/data.rb', line 700

# Write the media extension to a new Tempfile.
#
# Every call creates a fresh Tempfile and stores it in @media_resource_relationship.
#
# @return [Tempfile, nil] the rewound file, or nil when the media extension is
#   nil or empty
def media_resource_relationship
  return nil if media_extension.nil? || media_extension.empty?

  file = Tempfile.new('media_relationship.xml')
  @media_resource_relationship = file

  # With no core records the file holds only a newline.
  body =
    if no_records?
      "\n"
    else
      Export::CSV::Dwc::Extension::Media.csv(
        media_extension[:collection_objects],
        media_extension[:field_occurrences]
      )
    end

  file.write(body)
  file.flush
  file.rewind
  file
end

#meta_fieldsArray

id, and non-standard DwC columns are handled elsewhere

Returns:

  • (Array)

    use the temporarily written, and refined, CSV file to read off the existing headers so we can use them in writing meta.xml



721
722
723
724
725
726
# File 'lib/export/dwca/data.rb', line 721

# Read the header row of the written data file (#all_data) and return its
# tab-separated column names without the first column.
#
# @return [Array<String>] empty when there are no records or the file has no
#   first line
def meta_fields
  return [] if no_records?

  header_line = File.open(all_data, &:gets)
  return [] if header_line.nil?

  # The first column is dropped; it is handled elsewhere.
  header_line.strip.split("\t").drop(1)
end

#no_records?Boolean

Returns true if provided core_scope returns no records.

Returns:

  • (Boolean)

    true if provided core_scope returns no records



181
182
183
# File 'lib/export/dwca/data.rb', line 181

# @return [Boolean] true if provided core_scope returns no records, i.e. #total equals 0
def no_records?
  total == 0
end

#package_download(download) ⇒ Download

Returns a download instance.

Parameters:

Returns:



864
865
866
867
# File 'lib/export/dwca/data.rb', line 864

# Point the given download at the zipfile on disk.
#
# @param download [#update!] record updated with `source_file_path` set to the
#   zipfile path
# @return [Object] the same `download` that was passed in
def package_download(download)
  download.tap { |record| record.update!(source_file_path: zipfile.path) }
end

#predicate_options_present?Boolean

Returns:

  • (Boolean)


153
154
155
# File 'lib/export/dwca/data.rb', line 153

# @return [Boolean] true when at least one of the collection object or collecting
#   event predicate id lists is present
def predicate_options_present?
  data_predicate_ids
    .values_at(:collection_object_predicate_id, :collecting_event_predicate_id)
    .any?(&:present?)
end

#taxonworks_options_present?Boolean

Returns:

  • (Boolean)


157
158
159
# File 'lib/export/dwca/data.rb', line 157

# @return [Boolean] true when #taxonworks_extension_methods is present
def taxonworks_options_present?
  taxonworks_extension_methods.present?
end

#used_collecting_event_predicatesObject



452
453
454
455
456
# File 'lib/export/dwca/data.rb', line 452

# Distinct predicate names found in #collecting_event_attributes_query.
#
# @return [Array<String>] names in the form `TW:DataAttribute:CollectingEvent:<name>`
def used_collecting_event_predicates
  named = collecting_event_attributes_query
    .joins(:predicate)
    .select("CONCAT('TW:DataAttribute:CollectingEvent:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named.map { |record| record['predicate_name'] }
end

#used_collection_object_predicatesObject



446
447
448
449
450
# File 'lib/export/dwca/data.rb', line 446

# Distinct predicate names found in #collection_object_attributes_query.
#
# @return [Array<String>] names in the form `TW:DataAttribute:CollectionObject:<name>`
def used_collection_object_predicates
  named = collection_object_attributes_query
    .select("CONCAT('TW:DataAttribute:CollectionObject:', controlled_vocabulary_terms.name) predicate_name")
    .distinct

  named.map { |record| record['predicate_name'] }
end

#used_predicatesArray

Returns an Array of distinct Predicate names in the format

`TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`.

Returns:

  • (Array)

    of distinct Predicate names in the format

    `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
    


461
462
463
# File 'lib/export/dwca/data.rb', line 461

# All predicate names in use: collection object predicates first, then
# collecting event predicates.
#
# @return [Array] of distinct Predicate names in the format
#   `TW:DataAttribute:<CollectingEvent|CollectionObject>:<name>`
def used_predicates
  from_collection_objects = used_collection_object_predicates
  from_collecting_events = used_collecting_event_predicates

  from_collection_objects + from_collecting_events
end