Huh... actually, the hash-based one seems faster than the Set-based
one (see the benchmark results below).
And faster still, by a hair, is a last-in approach. Upon reflection,
all these techniques rely only on methods already in Enumerable, so
they can be put there instead of being Array-specific.
require 'set'

# Five interchangeable "unique by key" implementations, added to Enumerable
# so that anything that includes it can use them. Each method yields every
# element to the given block and keeps exactly one element per distinct
# block result (the "key"). They differ in technique and in whether the
# first or the last element seen for a key is the one that is kept.
module Enumerable
  # Set-based variant; keeps the first element seen for each key.
  #
  # @yield [x] each element; the block result is the uniqueness key
  # @return the elements whose key had not been seen before
  def uniq_by1
    seen = Set.new
    # Set#add? returns nil when the key is already present, so repeated
    # keys are rejected by #select.
    select { |x| seen.add?(yield(x)) }
  end

  # Hash-based variant; keeps the first element seen for each key.
  #
  # @yield [x] each element; the block result is the uniqueness key
  # @return the elements whose key had not been seen before
  def uniq_by2
    seen = {}
    select do |x|
      key = yield(x)
      # Truthy only the first time a key shows up; the assignment marks it.
      !seen[key] && (seen[key] = true)
    end
  end

  # Builds a key => element Hash from pairs; keeps the last element seen for
  # each key because later pairs overwrite earlier ones.
  #
  # @yield [x] each element; the block result is the uniqueness key
  # @return [Array] one element per distinct key
  def uniq_by3
    # Hand the [key, element] pairs to Hash[] as they are. Flattening and
    # splatting them would also flatten elements or keys that are themselves
    # arrays, producing broken pairs (or an odd argument count).
    Hash[map { |x| [yield(x), x] }].values
  end

  # fastest, preserves last-seen value for a key
  #
  # @yield [x] each element; the block result is the uniqueness key
  # @return [Array] one element per distinct key
  def uniq_by4
    h = {}
    each { |x| h[yield(x)] = x }
    h.values
  end

  # near-fastest, preserves first-seen value for a key
  #
  # @yield [x] each element; the block result is the uniqueness key
  # @return [Array] one element per distinct key
  def uniq_by5
    h = {}
    each do |x|
      v = yield(x)
      # Only the first element for a key is stored; later ones are ignored.
      h[v] = x unless h.key?(v)
    end
    h.values
  end
end
require 'benchmark'

# Sample input: small hashes, several of which share a value for :a or :b
# and several of which lack the key entirely (so the key is nil).
a = [
  { :a => 1, :d => 1 },
  { :b => 2 },
  { :c => 3 },
  { :a => 1, :d => 3 },
  { :a => 2, :e => 7 },
  { :a => 3, :b => 2 },
  { :a => 1 },
  { :a => 4 },
  { :f => 6 }
]

# Number of iterations per report; each iteration de-duplicates the sample
# once by :a and once by :b.
N = 20_000

# bmbm performs a rehearsal pass before the measured pass.
Benchmark.bmbm do |bm|
  bm.report('with set') do
    N.times do
      a.uniq_by1 { |item| item[:a] }
      a.uniq_by1 { |item| item[:b] }
    end
  end

  bm.report('with hash') do
    N.times do
      a.uniq_by2 { |item| item[:a] }
      a.uniq_by2 { |item| item[:b] }
    end
  end

  bm.report('Hash.[].values') do
    N.times do
      a.uniq_by3 { |item| item[:a] }
      a.uniq_by3 { |item| item[:b] }
    end
  end

  bm.report('#values (last in)') do
    N.times do
      a.uniq_by4 { |item| item[:a] }
      a.uniq_by4 { |item| item[:b] }
    end
  end

  bm.report('#values (first in)') do
    N.times do
      a.uniq_by5 { |item| item[:a] }
      a.uniq_by5 { |item| item[:b] }
    end
  end
end
#=> Rehearsal ------------------------------------------------------
#=> with set 2.500000 0.016000 2.516000 ( 2.547000)
#=> with hash 1.312000 0.000000 1.312000 ( 1.313000)
#=> Hash.[].values 2.453000 0.000000 2.453000 ( 2.453000)
#=> #values (last in) 1.110000 0.000000 1.110000 ( 1.109000)
#=> #values (first in) 1.296000 0.000000 1.296000 ( 1.297000)
#=> --------------------------------------------- total: 8.687000sec
#=>
#=> user system total real
#=> with set 2.000000 0.000000 2.000000 ( 1.999000)
#=> with hash 1.297000 0.000000 1.297000 ( 1.297000)
#=> Hash.[].values 2.531000 0.000000 2.531000 ( 2.532000)
#=> #values (last in) 1.125000 0.015000 1.140000 ( 1.140000)
#=> #values (first in) 1.344000 0.000000 1.344000 ( 1.344000)