Class: Export::Dwca::Occurrence::TaxonworksExtensionExporter

Inherits:
Object
  • Object
show all
Defined in:
lib/export/dwca/occurrence/taxonworks_extension_exporter.rb

Overview

Service object for exporting TaxonWorks custom fields to DwCA.

Instance Method Summary collapse

Constructor Details

#initialize(core_scope:, taxonworks_extension_methods: []) ⇒ TaxonworksExtensionExporter

Returns a new instance of TaxonworksExtensionExporter.

Parameters:

  • core_scope (ActiveRecord::Relation)

    DwcOccurrence scope

  • taxonworks_extension_methods (Array<String, Symbol>) (defaults to: [])

    extension field names



8
9
10
11
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 8

# Stores the export inputs; no querying happens here.
#
# @param core_scope [ActiveRecord::Relation] DwcOccurrence scope
# @param taxonworks_extension_methods [Array<String, Symbol>, nil]
#   extension field names; nil is treated the same as an empty list
def initialize(core_scope:, taxonworks_extension_methods: [])
  @core_scope = core_scope
  # Array() leaves an Array untouched, turns nil into [] and wraps a lone
  # name, so later iteration over the list cannot hit NoMethodError on nil.
  @taxonworks_extension_methods = Array(taxonworks_extension_methods)
end

Instance Method Details

#build_query_joins(field_data) ⇒ ActiveRecord::Relation (private)

Builds query with necessary joins based on field types.

Parameters:

  • field_data (Hash)

    field classification data from classify_extension_fields

Returns:

  • (ActiveRecord::Relation)

    query with joins applied



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 172

# Assembles the relation used for the extension export, adding only the joins
# that the requested fields need.
#
# @param field_data [Hash] field classification data from classify_extension_fields
# @return [ActiveRecord::Relation] query with joins applied
def build_query_joins(field_data)
  needs_collecting_event = field_data[:ce_columns].any?
  needs_otu = field_data[:methods].key?(:otu_name)

  # collection_objects is always joined.
  relation = collection_object_scope.joins(
    'JOIN collection_objects ON collection_objects.id = dwc_occurrences.dwc_occurrence_object_id'
  )

  if needs_collecting_event
    relation = relation.joins(
      'LEFT JOIN collecting_events ON collecting_events.id = collection_objects.collecting_event_id'
    )
  end

  return relation unless needs_otu

  # TODO: remove this when the issue is fixed!
  # !! Guarding against a given otu with > 1 position=1 taxon
  # determinations (which, at time of writing, was actually happening).
  # The lateral subquery keeps a single determination (highest id) per
  # collection object.
  relation.joins(<<~SQL.squish)
    LEFT JOIN LATERAL (
      SELECT td.otu_id
      FROM taxon_determinations td
      WHERE td.taxon_determination_object_id = collection_objects.id
        AND td.taxon_determination_object_type = 'CollectionObject'
        AND td.position = 1
      ORDER BY td.id DESC
      LIMIT 1
    ) td1 ON true
    LEFT JOIN otus ON otus.id = td1.otu_id
  SQL
end

#build_select_columns(field_data) ⇒ Array<String> (private)

Builds SELECT clause columns with proper aliasing.

Parameters:

  • field_data (Hash)

    field classification data from classify_extension_fields

Returns:

  • (Array<String>)

    SELECT column specifications



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 207

# Builds the SELECT list for the extension query.
#
# 'dwc_occurrences.id' always comes first. The id columns of the joined
# tables are selected under distinct aliases (collecting_event_id,
# collection_object_id, dwc_occurrence_id) so they do not share the plain
# "id" name with it.
#
# @param field_data [Hash] field classification data from classify_extension_fields
# @return [Array<String>] SELECT column specifications
def build_select_columns(field_data)
  # CE fields - an entry of :collecting_event_id stands for collecting_events.id.
  ce_sql = field_data[:ce_columns].map do |name|
    name == :collecting_event_id ? 'collecting_events.id AS collecting_event_id' : "collecting_events.#{name}"
  end

  # CO fields - :id is exposed as collection_object_id.
  co_sql = field_data[:co_columns].map do |name|
    name == :id ? 'collection_objects.id AS collection_object_id' : "collection_objects.#{name}"
  end

  # DWCO fields - :id is exposed as dwc_occurrence_id and listed before the
  # remaining dwc_occurrences columns.
  id_entries, plain_dwco = field_data[:dwco_columns].partition { |name| name == :id }

  columns = ['dwc_occurrences.id']
  # Computed fields
  columns << 'otus.name AS otu_name' if field_data[:methods].key?(:otu_name)
  columns.concat(ce_sql, co_sql)
  columns << 'dwc_occurrences.id AS dwc_occurrence_id' unless id_entries.empty?
  columns.concat(plain_dwco.map { |name| "dwc_occurrences.#{name}" })
end

#classify_extension_fields ⇒ Hash (private)

Classifies extension fields into their source types and builds metadata.

Returns:

  • (Hash)

    with field classifications and metadata arrays



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 120

# Sorts each requested extension name into the source it is read from and
# collects the metadata needed to build the query and the output rows.
# Names found in none of the CollectionObject extension tables are skipped.
#
# @return [Hash] with keys
#   :methods (computed method => CSV header),
#   :ce_columns, :co_columns, :dwco_columns (column names per source, in
#   requested order),
#   :column_data ([source_type, column_or_method] pairs in output order),
#   :used_extensions (CSV header names in output order)
def classify_extension_fields
  # Lookup tables in priority order: the first table that knows a name wins.
  lookups = [
    [:method, ::CollectionObject::EXTENSION_COMPUTED_FIELDS],
    [:ce, ::CollectionObject::EXTENSION_CE_FIELDS],
    [:co, ::CollectionObject::EXTENSION_CO_FIELDS],
    [:dwco, ::CollectionObject::EXTENSION_DWC_OCCURRENCE_FIELDS]
  ]
  headers_by_source = { method: {}, ce: {}, co: {}, dwco: {} }
  column_data = []
  used_extensions = []

  @taxonworks_extension_methods.map(&:to_sym).each do |requested|
    source_type, table = lookups.find { |_type, candidates| candidates[requested] }
    next unless source_type

    target = table[requested]
    header = "TW:Internal:#{requested}".freeze

    headers_by_source[source_type][target] = header
    column_data << [source_type, target]
    used_extensions << header
  end

  # Column lists keep the requested order; the collecting event's virtual :id
  # is handed on as :collecting_event_id.
  ce_columns = headers_by_source[:ce].keys.map { |name| name == :id ? :collecting_event_id : name }

  {
    methods: headers_by_source[:method],
    ce_columns: ce_columns,
    co_columns: headers_by_source[:co].keys,
    dwco_columns: headers_by_source[:dwco].keys,
    column_data: column_data,
    used_extensions: used_extensions
  }
end

#collection_object_scope ⇒ Object (private)



96
97
98
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 96

# Narrows the core scope to rows whose dwc_occurrence_object_type is
# 'CollectionObject'. The result is cached in @collection_object_scope, so
# the scope is only built on the first call.
def collection_object_scope
  return @collection_object_scope if @collection_object_scope

  @collection_object_scope = @core_scope.where(dwc_occurrence_object_type: 'CollectionObject')
end

#export_to(output_file) ⇒ Tempfile

Main export method - writes TaxonWorks extension data to output file.

Parameters:

  • output_file (Tempfile, File)

    output file for extension TSV data

Returns:

  • (Tempfile)

    the output file



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 16

# Main export method - writes TaxonWorks extension data to output file.
#
# Writes a tab-separated header row followed by one row per selected
# dwc_occurrence, fetched in id-ordered batches. Nothing is written when no
# extension fields were recognised or the scope has no collection objects.
# The file is flushed and rewound before returning, including on error.
#
# @param output_file [Tempfile, File] output file for extension TSV data
# @return [Tempfile] the output file
def export_to(output_file)
  data = extension_data_query_data
  base_query = data[:query]
  column_data = data[:column_data]
  used_extensions = data[:used_extensions]

  if used_extensions.empty? || !collection_object_scope.exists?
    Rails.logger.debug 'dwca_export: taxonworks_extension_data prepared - ' + (used_extensions.empty? ? 'no extensions' : 'no collection objects')
    return output_file
  end

  csv = ::CSV.new(output_file, col_sep: "\t")
  csv << used_extensions

  # Result-row key for every output column, in the same order as
  # used_extensions. Resolved once here instead of once per row; a source
  # column of :id is read from its per-table alias.
  row_keys = column_data.map do |source_type, col|
    case source_type
    when :method
      col.to_s
    when :ce
      col == :id ? 'collecting_event_id' : col.to_s
    when :co
      col == :id ? 'collection_object_id' : col.to_s
    when :dwco
      col == :id ? 'dwc_occurrence_id' : col.to_s
    end
  end

  conn = ActiveRecord::Base.connection

  batch_size = 50_000
  last_id = nil

  loop do
    # IMPORTANT:
    # - We re-apply a deterministic order here.
    # - We filter by id > last_id to keep batches stable and non-overlapping.
    # - We limit the batch size to stream results.
    batch_rel = base_query
      .reorder('dwc_occurrences.id ASC')
      .limit(batch_size)
    batch_rel = batch_rel.where('dwc_occurrences.id > ?', last_id) if last_id

    result = conn.select_all(batch_rel.to_sql)
    break if result.empty?

    result.each do |row|
      output_row = row_keys.map do |key|
        v = row[key]
        # nil stays nil (empty cell); everything else, including false, is
        # stringified and sanitized.
        v.nil? ? nil : Utilities::Strings.sanitize_for_csv(v.to_s)
      end

      csv << output_row
    end

    # A batch shorter than the limit means there are no further rows, so the
    # extra query that would only come back empty is skipped.
    break if result.length < batch_size

    # last_id must come from the actual selected "dwc_occurrences.id".
    # Since select_cols starts with 'dwc_occurrences.id', the key should be "id".
    last_id = result.last['id']
  end

  Rails.logger.debug 'dwca_export: extension data written'
  csv.flush
  Rails.logger.debug 'dwca_export: taxonworks_extension_data prepared'

  output_file
ensure
  output_file.flush
  output_file.rewind
end

#extension_data_query_data ⇒ Hash (private)

Builds the SQL query and metadata needed for taxonworks_extension_data export.

Returns:

  • (Hash)

    with keys: :query - ActiveRecord::Relation with all needed joins and select columns; :column_data - array of [column_source_type, column_or_method] pairs in CSV order; :used_extensions - array of CSV header names in output order



105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 105

# Classifies the requested extension fields, then builds the joined relation
# and its SELECT list from that classification.
#
# @return [Hash] with keys
#   :query (relation with joins and select columns applied),
#   :column_data (taken unchanged from the classification),
#   :used_extensions (taken unchanged from the classification)
def extension_data_query_data
  classified = classify_extension_fields
  column_data, used_extensions = classified.values_at(:column_data, :used_extensions)

  # Joins are built before the select list, then the select is applied on top.
  relation = build_query_joins(classified).select(build_select_columns(classified))

  { query: relation, column_data: column_data, used_extensions: used_extensions }
end