Class: Export::Dwca::Occurrence::TaxonworksExtensionExporter
- Inherits:
-
Object
- Object
- Export::Dwca::Occurrence::TaxonworksExtensionExporter
- Defined in:
- lib/export/dwca/occurrence/taxonworks_extension_exporter.rb
Overview
Service object for exporting TaxonWorks custom fields to DwCA.
Instance Method Summary
-
#build_query_joins(field_data) ⇒ ActiveRecord::Relation
private
Builds query with necessary joins based on field types.
-
#build_select_columns(field_data) ⇒ Array<String>
private
Builds SELECT clause columns with proper aliasing.
-
#classify_extension_fields ⇒ Hash
private
Classifies extension fields into their source types and builds metadata.
- #collection_object_scope ⇒ Object private
-
#export_to(output_file) ⇒ Tempfile
Main export method - writes TaxonWorks extension data to output file.
-
#extension_data_query_data ⇒ Hash
private
Builds the SQL query and metadata needed for taxonworks_extension_data export.
-
#initialize(core_scope:, taxonworks_extension_methods: []) ⇒ TaxonworksExtensionExporter
constructor
A new instance of TaxonworksExtensionExporter.
Constructor Details
#initialize(core_scope:, taxonworks_extension_methods: []) ⇒ TaxonworksExtensionExporter
Returns a new instance of TaxonworksExtensionExporter.
8 9 10 11 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 8
#
# Returns a new instance of TaxonworksExtensionExporter.
#
# @param core_scope [Object] scope of records to export; it is later narrowed
#   with `where(dwc_occurrence_object_type: 'CollectionObject')`.
# @param taxonworks_extension_methods [Array<String, Symbol>, nil] names of
#   the TaxonWorks extension fields to export, in output column order.
def initialize(core_scope:, taxonworks_extension_methods: [])
  @core_scope = core_scope
  # An explicit nil bypasses the keyword default above; normalize it so the
  # later `.map(&:to_sym)` over this list cannot fail on nil.
  @taxonworks_extension_methods = Array(taxonworks_extension_methods)
end
Instance Method Details
#build_query_joins(field_data) ⇒ ActiveRecord::Relation (private)
Builds query with necessary joins based on field types.
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 172
#
# Builds query with necessary joins based on field types.
#
# @param field_data [Hash] classification result; only :ce_columns and
#   :methods are read here.
# @return [ActiveRecord::Relation] collection_object_scope with
#   collection_objects always joined, collecting_events joined only when a
#   collecting event column was requested, and otus joined only when
#   :otu_name was requested.
def build_query_joins(field_data)
  query = collection_object_scope.joins(
    'JOIN collection_objects ON collection_objects.id = dwc_occurrences.dwc_occurrence_object_id'
  )

  if field_data[:ce_columns].any?
    query = query.joins(
      'LEFT JOIN collecting_events ON collecting_events.id = collection_objects.collecting_event_id'
    )
  end

  return query unless field_data[:methods].key?(:otu_name)

  # TODO: remove this when the issue is fixed!
  # !! Guarding against a given otu with > 1 position=1 taxon
  # determinations (which, at time of writing, was actually happening).
  query.joins(<<~SQL.squish)
    LEFT JOIN LATERAL (
      SELECT td.otu_id
      FROM taxon_determinations td
      WHERE td.taxon_determination_object_id = collection_objects.id
      AND td.taxon_determination_object_type = 'CollectionObject'
      AND td.position = 1
      ORDER BY td.id DESC
      LIMIT 1
    ) td1 ON true
    LEFT JOIN otus ON otus.id = td1.otu_id
  SQL
end
#build_select_columns(field_data) ⇒ Array<String> (private)
Builds SELECT clause columns with proper aliasing.
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 207
#
# Builds SELECT clause columns with proper aliasing.
#
# The list always starts with 'dwc_occurrences.id'. Identifier columns of the
# joined tables are aliased (collecting_event_id, collection_object_id,
# dwc_occurrence_id) so they do not collide with that leading "id" column.
#
# @param field_data [Hash] classification result; reads :methods,
#   :ce_columns, :co_columns and :dwco_columns.
# @return [Array<String>] SQL select expressions
def build_select_columns(field_data)
  select_cols = ['dwc_occurrences.id']

  # Computed fields
  select_cols << 'otus.name AS otu_name' if field_data[:methods].key?(:otu_name)

  # CE fields - the virtual :id column arrives here as :collecting_event_id
  field_data[:ce_columns].each do |col|
    select_cols <<
      if col == :collecting_event_id
        'collecting_events.id AS collecting_event_id'
      else
        "collecting_events.#{col}"
      end
  end

  # CO fields - map virtual :id column to collection_object_id
  field_data[:co_columns].each do |col|
    select_cols << (col == :id ? 'collection_objects.id AS collection_object_id' : "collection_objects.#{col}")
  end

  # DWCO fields - the aliased id (when requested) goes first, then the
  # remaining columns in their requested order.
  dwco_columns = field_data[:dwco_columns]
  select_cols << 'dwc_occurrences.id AS dwc_occurrence_id' if dwco_columns.include?(:id)
  dwco_columns.each do |col|
    select_cols << "dwc_occurrences.#{col}" unless col == :id
  end

  select_cols
end
#classify_extension_fields ⇒ Hash (private)
Classifies extension fields into their source types and builds metadata.
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 120
#
# Classifies extension fields into their source types and builds metadata.
#
# Each requested name is looked up, in this order, in the computed,
# collecting event, collection object and DwC occurrence field tables of
# ::CollectionObject; the first table with a truthy entry wins. Names found in
# no table are skipped.
#
# @return [Hash] with keys:
#   - :methods         Hash of computed method => CSV header
#   - :ce_columns      collecting event columns (:id replaced by :collecting_event_id)
#   - :co_columns      collection object columns
#   - :dwco_columns    DwC occurrence columns
#   - :column_data     [source_type, column] pairs in requested order
#   - :used_extensions CSV headers in requested order
def classify_extension_fields
  computed = {}
  ce_fields = {}
  co_fields = {}
  dwco_fields = {}
  column_data = []
  used_extensions = []

  # Ordered [source type, lookup table, destination hash]; order sets precedence.
  sources = [
    [:method, ::CollectionObject::EXTENSION_COMPUTED_FIELDS, computed],
    [:ce, ::CollectionObject::EXTENSION_CE_FIELDS, ce_fields],
    [:co, ::CollectionObject::EXTENSION_CO_FIELDS, co_fields],
    [:dwco, ::CollectionObject::EXTENSION_DWC_OCCURRENCE_FIELDS, dwco_fields]
  ]

  @taxonworks_extension_methods.map(&:to_sym).each do |sym|
    source_type, lookup, bucket = sources.find { |_type, table, _bucket| table[sym] }
    next unless source_type

    csv_header_name = "TW:Internal:#{sym}".freeze
    target = lookup[sym]

    bucket[target] = csv_header_name
    column_data << [source_type, target]
    used_extensions << csv_header_name
  end

  # Extract column arrays for query building (preserve requested order).
  # map virtual :id to :collecting_event_id
  ce_columns = ce_fields.keys.map { |col| col == :id ? :collecting_event_id : col }

  {
    methods: computed,
    ce_columns: ce_columns,
    co_columns: co_fields.keys,
    dwco_columns: dwco_fields.keys,
    column_data: column_data,
    used_extensions: used_extensions
  }
end
#collection_object_scope ⇒ Object (private)
96 97 98 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 96
#
# The core scope restricted to records whose dwc_occurrence_object_type is
# 'CollectionObject'. The result is memoized once it is truthy.
#
# @return [Object] whatever `@core_scope.where(...)` returns
def collection_object_scope
  return @collection_object_scope if @collection_object_scope

  @collection_object_scope = @core_scope.where(dwc_occurrence_object_type: 'CollectionObject')
end
#export_to(output_file) ⇒ Tempfile
Main export method - writes TaxonWorks extension data to output file.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 16
#
# Main export method - writes TaxonWorks extension data to output file.
#
# Writes a tab-separated header row followed by one row per selected record,
# reading records in id-ordered batches. Nothing is written when no known
# extension fields were requested or the collection object scope is empty.
#
# @param output_file [Tempfile] IO to write to
# @return [Tempfile] the same output_file; the ensure clause always flushes
#   and rewinds it, including on the early return.
def export_to(output_file)
  data = extension_data_query_data
  base_query = data[:query]
  used_extensions = data[:used_extensions]

  if used_extensions.empty? || !collection_object_scope.exists?
    reason = used_extensions.empty? ? 'no extensions' : 'no collection objects'
    Rails.logger.debug "dwca_export: taxonworks_extension_data prepared - #{reason}"
    return output_file
  end

  # Resolve the result-row key of every output column once, instead of once
  # per row. Order follows column_data so values line up with used_extensions.
  row_keys = data[:column_data].map do |source_type, col|
    case source_type
    when :method then col.to_s
    when :ce then col == :id ? 'collecting_event_id' : col.to_s
    when :co then col == :id ? 'collection_object_id' : col.to_s
    when :dwco then col == :id ? 'dwc_occurrence_id' : col.to_s
    end
  end

  csv = ::CSV.new(output_file, col_sep: "\t")
  csv << used_extensions

  conn = ActiveRecord::Base.connection
  batch_size = 50_000
  last_id = nil

  loop do
    # IMPORTANT:
    # - We re-apply a deterministic order here.
    # - We filter by id > last_id to keep batches stable and non-overlapping.
    # - We limit the batch size to stream results.
    batch_rel = base_query.reorder('dwc_occurrences.id ASC').limit(batch_size)
    batch_rel = batch_rel.where('dwc_occurrences.id > ?', last_id) if last_id

    result = conn.select_all(batch_rel.to_sql)
    break if result.empty?

    result.each do |row|
      output_row = row_keys.map do |key|
        value = key && row[key]
        # Only nil becomes an empty cell; false must still be written as text.
        value.nil? ? nil : Utilities::Strings.sanitize_for_csv(value.to_s)
      end
      csv << output_row
    end

    # last_id must come from the actual selected "dwc_occurrences.id".
    # Since select_cols starts with 'dwc_occurrences.id', the key should be "id".
    last_id = result.last['id']
  end

  Rails.logger.debug 'dwca_export: extension data written'
  csv.flush
  Rails.logger.debug 'dwca_export: taxonworks_extension_data prepared'

  output_file
ensure
  output_file.flush
  output_file.rewind
end
#extension_data_query_data ⇒ Hash (private)
Builds the SQL query and metadata needed for taxonworks_extension_data export.
105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/export/dwca/occurrence/taxonworks_extension_exporter.rb', line 105
#
# Builds the SQL query and metadata needed for taxonworks_extension_data export.
#
# @return [Hash] with :query (the joined query with the select columns
#   applied), plus :column_data and :used_extensions passed through from
#   classify_extension_fields.
def extension_data_query_data
  classified = classify_extension_fields

  joined = build_query_joins(classified)
  columns = build_select_columns(classified)

  {
    query: joined.select(columns),
    column_data: classified[:column_data],
    used_extensions: classified[:used_extensions]
  }
end