Using the following benchmark:
def create_genome
"gattaca" * 100
end
def count_frequency_using_chars(sequence)
100000.times do
sequence.chars.group_by{|x| x}.map{|letter, array| [letter, array.count]}
end
end
def count_frequency_using_count(sequence)
100000.times do
["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
end
end
sequence = create_genome
count_frequency_using_chars(sequence)
count_frequency_using_count(sequence)
I found that, in C-based Ruby for both 1.8 and 1.9.2, using String#count(letter)
is approximately 50 times faster than sorting and counting them using Enumerable#group_by
and Array#count
. I was slightly surprised at this, because the String#count
approach reads through the the string four times each iteration, whereas the latter only reads through it once.
I tried running the code under ruby-prof and perftools.rb, and both of them merely indicated that String#chars
took 90% of the time, with no break-down of where that 90% of time was spent.
If I had to guess why there's a difference, I'd say that creating 70 million single-character strings would be expensive, but how would I be able to know? (Update: String#chars
wasn't the culprit - see the benchmark for mainly_execute_a_trivial_block
)
Edit: Current benchmarks using 1.9.2 patchlevel 180:
require 'pp'
require 'benchmark'
def create_genome
"gattaca" * 100
end
ZILLION = 100000
def count_frequency_using_count(sequence)
ZILLION.times do
["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
end
end
def count_frequency_using_chars(sequence)
ZILLION.times do
sequence.chars.group_by{|x| x}.map{|letter, array| [letter, array.count]}
end
end
def count_frequency_using_inject_hash(sequence)
ZILLION.times do
sequence.chars.inject(Hash.new(0)) { |h, e| h[e] += 1 ; h }
end
end
def count_frequency_using_each_with_object(sequence)
ZILLION.times do
sequence.chars.each_with_object(Hash.new(0)) { |char, hash| hash[char] += 1}
end
end
def just_group_by(sequence)
ZILLION.times do
sequence.chars.group_by{|x| x}
end
end
def just_chars_and_trivial_block(sequence)
ZILLION.times do
sequence.chars() {}
end
end
def mainly_execute_a_trivial_block(sequence)
ZILLION.times do
sequence.length.times() {}
end
end
def execute_an_empty_loop_instead(sequence)
ZILLION.times do
i = 0
max = sequence.length
until i == max
i += 1
end
end
end
sequence = create_genome
puts RUBY_VERSION
Benchmark.bm do |benchmark|
benchmark.report do
count_frequency_using_count(sequence)
end
benchmark.report do
count_frequency_using_chars(sequence)
end
benchmark.report do
count_frequency_using_inject_hash(sequence)
end
benchmark.report do
count_frequency_using_each_with_object(sequence)
end
benchmark.report do
just_group_by(sequence)
end
benchmark.report do
just_chars_and_trivial_block(sequence)
end
benchmark.report do
mainly_execute_a_trivial_block(sequence)
end
benchmark.report do
execute_an_empty_for_loop_instead(sequence)
end
end
Results:
user system total real
0.510000 0.000000 0.510000 ( 0.508499) # count_frequency_using_count
23.470000 0.030000 23.500000 ( 23.575487) # count_frequency_using_chars
32.770000 0.050000 32.820000 ( 32.874634) # count_frequency_using_inject_hash
31.880000 0.040000 31.920000 ( 31.942437) # count_frequency_using_each_with_object
22.950000 0.030000 22.980000 ( 22.970500) # just_group_by
13.300000 0.020000 13.320000 ( 13.314466) # just_chars_and_trivial_block
5.660000 0.000000 5.660000 ( 5.661516) # mainly_execute_a_trivial_block
1.930000 0.010000 1.940000 ( 1.934861) # execute_an_empty_loop_instead