
In our Rails 3.2.13 app (Ruby 2.0.0 + Postgres on Heroku), we are often retrieving a large amount of Order data from an API, and then we need to update or create each order in our database, along with its associations. A single order creates/updates itself plus approx. 10-15 associated objects, and we are importing up to 500 orders at a time.

The code below works, but the problem is that it's not at all efficient in terms of speed. Creating/updating 500 records takes approx. 1 minute and generates 6500+ db queries!

def add_details(shop, shopify_orders)
  shopify_orders.each do |shopify_order|
    order = Order.where(:order_id => shopify_order.id.to_s, :shop_id => shop.id).first_or_create
    order.update_details(order, shopify_order, shop)  # This calls update_attributes for the Order
    ShippingLine.add_details(order, shopify_order.shipping_lines)
    LineItem.add_details(order, shopify_order.line_items)
    Taxline.add_details(order, shopify_order.tax_lines)
    Fulfillment.add_details(order, shopify_order.fulfillments)
    Note.add_details(order, shopify_order.note_attributes)
    Discount.add_details(order, shopify_order.discount_codes)
    billing_address = shopify_order.billing_address rescue nil
    if !billing_address.blank?
      BillingAddress.add_details(order, billing_address)
    end
    shipping_address = shopify_order.shipping_address rescue nil
    if !shipping_address.blank?
      ShippingAddress.add_details(order, shipping_address)
    end
    payment_details = shopify_order.payment_details rescue nil
    if !payment_details.blank?
      PaymentDetail.add_details(order, payment_details)
    end
  end
end

  def update_details(order, shopify_order, shop)
    order.update_attributes(
      :order_name => shopify_order.name,
      :order_created_at => shopify_order.created_at,
      :order_updated_at => shopify_order.updated_at,
      :status => Order.get_status(shopify_order),
      :payment_status => shopify_order.financial_status,
      :fulfillment_status => Order.get_fulfillment_status(shopify_order),
      :payment_method => shopify_order.processing_method,
      :gateway => shopify_order.gateway,
      :currency => shopify_order.currency,
      :subtotal_price => shopify_order.subtotal_price,
      :subtotal_tax => shopify_order.total_tax,
      :total_discounts => shopify_order.total_discounts,
      :total_line_items_price => shopify_order.total_line_items_price,
      :total_price => shopify_order.total_price,
      :total_tax => shopify_order.total_tax,
      :total_weight => shopify_order.total_weight,
      :taxes_included => shopify_order.taxes_included,
      :shop_id => shop.id,
      :email => shopify_order.email,
      :order_note => shopify_order.note
    )
  end

So, as you can see, we are looping through each order, checking whether it exists (then either loading the existing Order or creating a new one), and then calling update_attributes to pass in the details for the Order. After that we create or update each of the associations. Each associated model looks very similar to this:

  class << self
    def add_details(order, tax_lines)
      tax_lines.each do |shopify_tax_line|
        taxline = Taxline.find_or_create_by_order_id(:order_id => order.id)
        taxline.update_details(shopify_tax_line)
      end
    end
  end
  def update_details(tax_line)
    self.update_attributes(:price => tax_line.price, :rate => tax_line.rate, :title => tax_line.title)
  end

I've looked into the activerecord-import gem, but unfortunately it seems to be geared more towards creating records in bulk rather than updating them, which we also require.
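
For reference, the insert half of what we need would look roughly like this with activerecord-import (a minimal sketch assuming the gem's Model.import API; new_shopify_orders is a hypothetical list of orders we already know are not in the database, and only a subset of our Order columns is shown):

# Sketch only: bulk-inserts brand-new orders in one statement via activerecord-import.
# new_shopify_orders is hypothetical (orders already known not to exist yet),
# and only a few of our Order columns are shown.
columns = [:order_id, :shop_id, :order_name, :total_price]
values  = new_shopify_orders.map do |shopify_order|
  [shopify_order.id.to_s, shop.id, shopify_order.name, shopify_order.total_price]
end
Order.import columns, values, :validate => false

But that still leaves the update case (existing orders) unsolved, which is the part we're stuck on.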

What is the best way that this can be improved for performance?

Many many thanks in advance.

UPDATE:

I came up with this slight improvement, which essentially removes the call to update the newly created Orders (one query less per order).

def add_details(shop, shopify_orders)
  shopify_orders.each do |shopify_order|
    values = {:order_id => shopify_order.id.to_s, :shop_id => shop.id,
              :order_name => shopify_order.name,
              :order_created_at => shopify_order.created_at,
              :order_updated_at => shopify_order.updated_at,
              :status => Order.get_status(shopify_order),
              :payment_status => shopify_order.financial_status,
              :fulfillment_status => Order.get_fulfillment_status(shopify_order),
              :payment_method => shopify_order.processing_method,
              :gateway => shopify_order.gateway,
              :currency => shopify_order.currency,
              :subtotal_price => shopify_order.subtotal_price,
              :subtotal_tax => shopify_order.total_tax,
              :total_discounts => shopify_order.total_discounts,
              :total_line_items_price => shopify_order.total_line_items_price,
              :total_price => shopify_order.total_price,
              :total_tax => shopify_order.total_tax,
              :total_weight => shopify_order.total_weight,
              :taxes_included => shopify_order.taxes_included,
              :email => shopify_order.email,
              :order_note => shopify_order.note}
    get_order = Order.where(:order_id => shopify_order.id.to_s, :shop_id => shop.id)
    if get_order.blank?
      order = Order.create(values)
    else
      order = get_order.first
      order.update_attributes(values)
    end
    ShippingLine.add_details(order, shopify_order.shipping_lines)
    LineItem.add_details(order, shopify_order.line_items)
    Taxline.add_details(order, shopify_order.tax_lines)
    Fulfillment.add_details(order, shopify_order.fulfillments)
    Note.add_details(order, shopify_order.note_attributes)
    Discount.add_details(order, shopify_order.discount_codes)
    billing_address = shopify_order.billing_address rescue nil
    if !billing_address.blank?
      BillingAddress.add_details(order, billing_address)
    end
    shipping_address = shopify_order.shipping_address rescue nil
    if !shipping_address.blank?
      ShippingAddress.add_details(order, shipping_address)
    end
    payment_details = shopify_order.payment_details rescue nil
    if !payment_details.blank?
      PaymentDetail.add_details(order, payment_details)
    end
  end
end

and for the associated objects:

  class << self
    def add_details(order, tax_lines)
      tax_lines.each do |shopify_tax_line|
        values = {:order_id => order.id,
                  :price => shopify_tax_line.price,
                  :rate => shopify_tax_line.rate,
                  :title => shopify_tax_line.title}
        get_taxline = Taxline.where(:order_id => order.id)
        if get_taxline.blank?
          taxline = Taxline.create(values)
        else
          taxline = get_taxline.first
          taxline.update_attributes(values)
        end
      end
    end
  end

Any better suggestions?

Bjorn Forsberg
  • You handle a lot of data, so it's pretty normal that it's slow. What's the context of this code? Is it in an API? Is it in a web request? Is it blocking code? Do you need the data to be available very fast, or can you push some jobs to a queue and forget about them? – Intrepidd Sep 25 '13 at 08:46
  • Hi @Intrepidd. We are consuming an API and then running this in a background job, while the front-end checks via AJAX calls to see when it's done. So basically, first we grab the orders via the API, then loop through them to place them in the DB. When the user first installs our app is when we do most of the importing, and then the user waits for the job to be done. After that it's more about keeping it updated and grabbing any new orders. Any help on making it faster is appreciated. – Bjorn Forsberg Sep 25 '13 at 08:53
  • Why don't you use get_order = Order.find_or_create_by instead of conditionals? – Fenec Sep 25 '13 at 09:21
  • @Fenec Good point. I was actually considering using `get_order = Order.where(:order_id => shopify_order.id.to_s, :shop_id => shop.id).first_or_create` instead. It will make the code prettier, but not sure if it will speed things up though. Updated code example above to reflect this. – Bjorn Forsberg Sep 25 '13 at 09:31
  • Have you tried update_all: http://apidock.com/rails/ActiveRecord/Base/update_all/class – Betjamin Richards Sep 25 '13 at 10:10
  • If information you provided is correct it means there are around 600 queries/second executed which I would call pretty robust. Also, there's nothing special that updating "500 records" "plus approx. 10-15 associated objects" generates 6500+ queries. That's just maths! (1+ 10..15) * 500 = 5500..8000 – Mike Szyndel Sep 25 '13 at 10:54
  • @BetjaminRichards Thanks, but update_all pushes the same value to all of the records, which is not what we are after as the values may be different for each order. – Bjorn Forsberg Sep 25 '13 at 11:08
  • Ok, thanks. Agree with the maths, was just hoping there was some operation which could combine many of the individual update/insert queries into a single query thereby reducing the overall operations? – Bjorn Forsberg Sep 25 '13 at 12:23

3 Answers


Try wrapping your entire code in a single database transaction. Since you're on Heroku, the back end will be Postgres. With that many update statements, you can probably benefit greatly by transacting them all at once, so your code executes quicker and essentially just leaves a "queue" of 6500 statements to run on the Postgres side as the server is able to dequeue them. Depending on the back end, you might have to transact in smaller chunks - but even transacting 100 at a time (and then closing and re-opening the transaction) would greatly improve throughput into Pg.

http://api.rubyonrails.org/classes/ActiveRecord/Transactions/ClassMethods.html
http://www.postgresql.org/docs/9.2/static/sql-set-transaction.html

So before line 2 you'd add something like:

def add_details(shop, shopify_orders)
  Order.transaction do
    shopify_orders.each do |shopify_order|

And then at the very end of your method add another end:

      if !payment_details.blank?
        PaymentDetail.add_details(order, payment_details)
      end
    end # shopify_orders.each
  end # Order.transaction
end # method
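
If a single transaction for the whole batch turns out to be too big, a rough sketch of the chunked variant mentioned above (100 orders per transaction, using each_slice; the per-order body is elided) could look like this:

def add_details(shop, shopify_orders)
  # Sketch only: commit in chunks of 100 orders so each transaction stays small.
  shopify_orders.each_slice(100) do |chunk|
    Order.transaction do
      chunk.each do |shopify_order|
        # ... same per-order create/update and association code as in the question ...
      end
    end
  end
end
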
Steve Midgley
  • That's an excellent answer, just the type of thing I was looking for! I'll test over the weekend and update once I've given it a go. We actually already pass in 250 Orders at a time to this, so I can easily adjust that to pass in 100 at a time to limit the size of the transaction. If that is still too much I can just move the `Order.transaction do` line below the `shopify_orders.each` loop so at least it will only do 1 transaction per Order (reducing it from 6500 transactions to 500 transactions total). – Bjorn Forsberg Sep 27 '13 at 06:55
  • Doesn't look like I'll be hitting the limit of transaction size :) http://stackoverflow.com/questions/709708/maximum-transaction-size-in-postgresql – Bjorn Forsberg Sep 27 '13 at 07:21
  • 1
    Thanks, this worked great. Using your suggested technique combined with the other changes I outlined above in my question, reduced the processing time by 50% !! :) – Bjorn Forsberg Oct 01 '13 at 21:21

You can monkey-patch ActiveRecord like this:

class ActiveRecord::Base

  #http://stackoverflow.com/questions/15317837/bulk-insert-records-into-active-record-table?lq=1
  #https://gist.github.com/jackrg/76ade1724bd816292e4e
  #  "UPDATE THIS SET <list_of_column_assignments>  FROM <table_name> THIS  JOIN (VALUES (<csv1>, <csv2>,...) VALS ( <column_names> ) ON <list_of_primary_keys_comparison>"
  def self.bulk_update(record_list)
      pk = self.primary_key
      raise "primary_key not found" unless pk.present?

      raise "record_list not an Array of Hashes" unless record_list.is_a?(Array) && record_list.all? {|rec| rec.is_a? Hash }
      return nil if record_list.empty?

      result = nil

      #test if every hash has primary keys, so we can JOIN
      record_list.each { |r|  raise "Primary Keys '#{self.primary_key.to_s}' not found on record: #{r}" unless hasAllPKs?(r) }


      #list of primary keys comparison
      pk_comparison_array = []
      if (pk).is_a?(Array)
          pk.each {|thiskey| pk_comparison_array << "THIS.#{thiskey} = VALS.#{thiskey}" }
      else
          pk_comparison_array << "THIS.#{pk} = VALS.#{pk}"
      end
      pk_comparison = pk_comparison_array.join(' AND ')

      #SQL
      (1..record_list.count).step(1000).each do |start|
        key_list, value_list = convert_record_list(record_list[start-1..start+999])
        #csv values
        csv_vals = value_list.map {|v| "(#{v.join(", ")})" }.join(", ")
        #column names
        column_names = key_list.join(", ")
        #list of columns assignments
        columns_assign_array = []
        key_list.each {|col|
          unless inPK?(col)
            columns_assign_array << "THIS.#{col} = VALS.#{col}"
          end }
        columns_assign = columns_assign_array.join(', ')

        sql = "UPDATE THIS SET #{columns_assign}  FROM #{self.table_name} THIS  JOIN ( VALUES #{csv_vals} ) VALS ( #{column_names} ) ON ( #{pk_comparison} )"
        result = self.connection.execute(sql)

        return result if result<0
      end

      return result

  end

  def self.inPK?(str)
      pk = self.primary_key

      test = str.to_s
      if pk.is_a?(Array)
            (pk.include?(test))
      else
            (pk==test)
      end
  end

  #test if given hash has primary keys included as hash keys and those keys are not empty
  def self.hasAllPKs?(hash)
      h = hash.stringify_keys
      pk = self.primary_key

      if pk.is_a?(Array)
           (pk.all? {|k| h.key?(k) and h[k].present? })
      else
           h.key?(pk) and h[pk].present?
      end
  end

  def self.convert_record_list(record_list)
    # Build the list of keys
    key_list = record_list.map(&:keys).flatten.map(&:to_s).uniq.sort

    value_list = record_list.map do |rec|
      list = []
      key_list.each {|key| list <<  ActiveRecord::Base.connection.quote(rec[key] || rec[key.to_sym]) }
      list
    end

    # If table has standard timestamps and they're not in the record list then add them to the record list
    time = ActiveRecord::Base.connection.quote(Time.now)
    for field_name in %w(created_at updated_at)
      if self.column_names.include?(field_name) && !(key_list.include?(field_name))
        key_list << field_name
        value_list.each {|rec| rec << time }
      end
    end

    return [key_list, value_list]
  end
end

Then you can generate an array of hashes containing your models' attributes (including their primary keys) and do something like:

ActiveRecord::Base.transaction do
   Model.bulk_update [ {attr1: val1, attr2: val2,...},  {attr1: val1, attr2: val2,...},   ... ]
end

It executes as a single SQL command (one per batch of 1,000 records), without Rails callbacks or validations.
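
For instance, tied back to the orders from the question, a hypothetical sketch could look like this (it assumes the records already exist and that you look up each row's id first, since bulk_update matches rows on the primary key; only a couple of the Order columns from the question are shown):

# Sketch only: bulk_update matches rows on the primary key, so every hash
# needs the existing record's id, looked up beforehand in one query.
existing = Order.where(:shop_id => shop.id,
                       :order_id => shopify_orders.map { |o| o.id.to_s })
id_for   = Hash[existing.map { |o| [o.order_id, o.id] }]

rows = shopify_orders.map do |shopify_order|
  id = id_for[shopify_order.id.to_s]
  next unless id   # skip orders that don't exist yet; they still need a create path
  { :id          => id,
    :order_name  => shopify_order.name,
    :total_price => shopify_order.total_price }
end.compact

ActiveRecord::Base.transaction do
  Order.bulk_update(rows) unless rows.empty?
end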

Fernando Fabreti
  • That sounds great, but when I try it, I get `ActiveRecord::StatementInvalid: PG::Error: ERROR: relation "this" does not exist`. Command looks like `UPDATE this SET this."best_ranking" = vals."best_ranking",this."updated_at" = vals."updated_at" FROM books this JOIN (VALUES ...) vals ("best_ranking", "id", "created_at", "updated_at") on this.id = vals.id`. Any thoughts about why this might happen (running on Postgres 9.1)? – Jack R-G Nov 19 '14 at 00:38
  • Sorry to hear that; in fact, it's my mistake. The code above works with SQL Server and MySQL. I did not see that the original question was about PostgreSQL. Trying to help, I've looked at the PostgreSQL UPDATE syntax (http://www.postgresql.org/docs/9.1/static/sql-update.html) and maybe you could try to modify the line that assigns the sql variable to this: `sql = "UPDATE table #{self.table_name} as THIS SET #{columns_assign} FROM #{self.table_name} JOIN ( VALUES #{csv_vals} ) as VALS ( #{column_names} ) ON ( #{pk_comparison} )"` beware I have NOT tested it! Good luck. – Fernando Fabreti Nov 19 '14 at 16:18
  • You can also try to run the above query inside pgAdmin or through the CLI, so that you can modify the syntax until it works. Let us know of the progress so that we can update the answer and share with others! – Fernando Fabreti Nov 19 '14 at 16:21
  • Thanks for the suggestion. I was able to get a variation of what you suggested to work. sql = "UPDATE #{self.table_name} AS this SET #{columns_assign} FROM (VALUES #{cvs_vals}) AS vals (#{column_names}) where #{pk_comparison}". I had tried something similar to that but did not use the "AS" keyword, which seems to be required in this case although the syntax definition shows it as optional. The important point here is that you cannot repeat the table being updated in the FROM phrase; therefore you cannot do a join and you must put the join condition in a WHERE phrase. – Jack R-G Nov 20 '14 at 18:43
  • Another issue is that there can be type issues when using FROM (VALUES ...). If, for example, one of the fields is a timestamp (e.g. updated_at), you cannot simply use a string value in the VALS array as you would in a simple UPDATE table SET updated_at = '12/14/2013', you must cast the string into a timestamp (UPDATE table SET updated_at = CAST(vals.updated_at AS TIMESTAMP) ... – Jack R-G Nov 20 '14 at 19:05
  • Ok! Maybe you could share an updated sql string to reflect the join problem, I'll update the answer. Or simply add a new answer. – Fernando Fabreti Nov 21 '14 at 16:03

For PostgreSQL, there are several issues that the above approach does not address:

  1. You must specify an actual table, not just an alias, as the update target.
  2. You cannot repeat the target table in the FROM phrase. Since you are joining the target table to a VALUES table (so there is only one table in the FROM phrase), you won't be able to use JOIN; you must instead use a WHERE clause.
  3. You don't get the same "free" casts in a VALUES table that you do in a simple UPDATE command, so you must cast date/timestamp values explicitly (#val_cast does this).

    class ActiveRecord::Base
    
      def self.update!(record_list)
        raise ArgumentError, "record_list not an Array of Hashes" unless record_list.is_a?(Array) && record_list.all? {|rec| rec.is_a? Hash }
        return record_list if record_list.empty?
    
        (1..record_list.count).step(1000).each do |start|
          field_list, value_list = convert_record_list(record_list[start-1..start+999])
          key_field = self.primary_key
          non_key_fields = field_list - [%Q["#{self.primary_key}"], %Q["created_at"]]
          columns_assign = non_key_fields.map {|field| "#{field} = #{val_cast(field)}"}.join(",")
          value_table = value_list.map {|row| "(#{row.join(", ")})" }.join(", ")
          sql = "UPDATE #{table_name} AS this SET #{columns_assign} FROM (VALUES #{value_table}) vals (#{field_list.join(", ")}) WHERE this.#{key_field} = vals.#{key_field}"
          self.connection.update_sql(sql)
        end
    
        return record_list
      end
    
      def self.val_cast(field)
        field = field.gsub('"', '')
        if (column = columns.find{|c| c.name == field }).sql_type =~ /time|date/
          "cast (vals.#{field} as #{column.sql_type})"
        else
          "vals.#{field}"
        end
      end
    
      def self.convert_record_list(record_list)
        # Build the list of fields
        field_list = record_list.map(&:keys).flatten.map(&:to_s).uniq.sort
    
        value_list = record_list.map do |rec|
          list = []
          field_list.each {|field| list <<  ActiveRecord::Base.connection.quote(rec[field] || rec[field.to_sym]) }
          list
        end
    
        # If table has standard timestamps and they're not in the record list then add them to the record list
        time = ActiveRecord::Base.connection.quote(Time.now)
        for field_name in %w(created_at updated_at)
          if self.column_names.include?(field_name) && !(field_list.include?(field_name))
            field_list << field_name
            value_list.each {|rec| rec << time }
          end
        end
    
        field_list.map! {|field| %Q["#{field}"] }
    
        return [field_list, value_list]
      end
    end
    
Jack R-G