M

#### Martin Hansen

The below code is too slow for practical use. I need it to run at least

20 times faster. Perhaps that is possible with some C code? I have no

experience with writing Ruby extensions. What are the pitfalls? Which

part of the code should be ported? Any pointers to get me started?

Cheers,

Martin

#!/usr/bin/env ruby

# IUPAC nucleotide pair ambiguity equivalents are saved in an

# array of bit fields.

BIT_A = 1 << 0

BIT_T = 1 << 1

BIT_C = 1 << 2

BIT_G = 1 << 3

EQUAL = Array.new(256, 0)

EQUAL['A'.ord] = BIT_A

EQUAL['T'.ord] = BIT_T

EQUAL['U'.ord] = BIT_T

EQUAL['C'.ord] = BIT_C

EQUAL['G'.ord] = BIT_G

EQUAL['M'.ord] = (BIT_A|BIT_C)

EQUAL['R'.ord] = (BIT_A|BIT_G)

EQUAL['W'.ord] = (BIT_A|BIT_T)

EQUAL['S'.ord] = (BIT_C|BIT_G)

EQUAL['Y'.ord] = (BIT_C|BIT_T)

EQUAL['K'.ord] = (BIT_G|BIT_T)

EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)

EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)

EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)

EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)

EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)

# Module containing code to locate nucleotide patterns in sequences

allowing for

# ambiguity codes and a given maximum edit distance.

# Insertions are nucleotides found in the pattern but not in the

sequence.

# Deletions are nucleotides found in the sequence but not in the

pattern.

#

# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):

# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf

module PatternMatcher

#

------------------------------------------------------------------------------

# str.match(pattern[, pos[, max_edit_distance]])

# -> Match or nil

#

#

------------------------------------------------------------------------------

# Method to locate the next pattern match starting from a given

position. A match

# is allowed to contain a given maximum edit distance. If a match is

located a

# Match object will be returned otherwise nil.

def match(pattern, pos = 0, max_edit_distance = 0)

@pattern = pattern

@pos = pos

@max_edit_distance = max_edit_distance

@vector = vector_init

while @pos < @seq.length

vector_update

return match_new if match_found?

@pos += 1

end

end

#

------------------------------------------------------------------------------

# str.scan(pattern[, pos[, max_edit_distance]])

# -> Array

# str.scan(pattern[, pos[, max_edit_distance]]) { |match|

# block

# }

# -> Match

#

#

------------------------------------------------------------------------------

# Method to iterate through a sequence to locate pattern matches

starting

# from a given position and allowing for a maximum edit distance.

# Matches found in block context return the Match object. Otherwise

matches are

# returned in an Array.

def scan(pattern, pos = 0, max_edit_distance = 0)

matches = []

while match = match(pattern, pos, max_edit_distance)

if block_given?

yield match

else

matches << match

end

pos = match.pos + 1

end

return matches unless block_given?

end

private

# Method to initailize the score vector and return this.

def vector_init

vector = []

(0 ... @pattern.length + 1).each do |i|

vector

*= Score.new(matches = 0, mismatches = 0, insertions = i)*

end

vector

end

# Method to update the score vector.

def vector_update

new_vector = @vector.dup

(0 ... @pattern.length).each do |i|

if match?(@seq[@pos], @pattern

end

vector

end

# Method to update the score vector.

def vector_update

new_vector = @vector.dup

(0 ... @pattern.length).each do |i|

if match?(@seq[@pos], @pattern

*)*

new_vector[i + 1] = @vectornew_vector[i + 1] = @vector

*.dup*

new_vector[i + 1].matches += 1

else

mismatch = @vectornew_vector[i + 1].matches += 1

else

mismatch = @vector

*.dup*

insertion = new_vectorinsertion = new_vector

*.dup*

deletion = @vector[i + 1].dup

if deletion?(mismatch, insertion, deletion)

deletion.deletions += 1

new_vector[i + 1] = deletion

elsif mismatch?(mismatch, insertion, deletion)

mismatch.mismatches += 1

new_vector[i + 1] = mismatch

elsif insertion?(mismatch, insertion, deletion)

insertion.insertions += 1

new_vector[i + 1] = insertion

end

end

end

@vector = new_vector

end

# Method to determine if a match occurred.

def match?(char1, char2)

(EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0

end

# Method to determine if a mismatch occured.

def mismatch?(mismatch, insertion, deletion)

if mismatch.edit_distance <= insertion.edit_distance and

mismatch.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if an insertion occured.

def insertion?(mismatch, insertion, deletion)

if insertion.edit_distance <= mismatch.edit_distance and

insertion.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if a deletion occured.

def deletion?(mismatch, insertion, deletion)

if deletion.edit_distance <= mismatch.edit_distance and

deletion.edit_distance <= insertion.edit_distance

true

end

end

# Method to print the score vector.

def vector_print

@vector.each do |s|

puts s

end

puts

end

# Method that returns a Match object initialized with

# information from the score vector.

def match_new

matches = @vector.last.matches

mismatches = @vector.last.mismatches

insertions = @vector.last.insertions

deletions = @vector.last.deletions

length = @pattern.length - insertions + deletions

pos = @pos - length + 1

match = @seq[pos ... pos + length]

Match.new(pos, match, matches, mismatches, insertions, deletions,

length)

end

# Method that determines if a match was found by analyzing the score

vector.

def match_found?

if @vector.last.edit_distance <= @max_edit_distance

true

end

end

# Class to instantiate Score objects that holds score information.

class Score

attr_accessor :matches, :mismatches, :insertions, :deletions

def initialize(matches = 0, mismatches = 0, insertions = 0,

deletions = 0)

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

end

# Method to calculate and return the edit distance.

def edit_distance

self.mismatches + self.insertions + self.deletions

end

private

def to_s

"(#{[self.matches, self.mismatches, self.insertions,

self.deletions].join(',')})"

end

end

# Class for creating Match objects which contain the description of a

# match between a nucleotide sequence and a pattern.

class Match

attr_reader os, :match, :matches, :mismatches, :insertions,

:deletions, :edit_distance, :length

def initialize(pos, match, matches, mismatches, insertions,

deletions, length)

@pos = pos

@match = match

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

@edit_distance = mismatches + insertions + deletions

@length = length

end

end

enddeletion = @vector[i + 1].dup

if deletion?(mismatch, insertion, deletion)

deletion.deletions += 1

new_vector[i + 1] = deletion

elsif mismatch?(mismatch, insertion, deletion)

mismatch.mismatches += 1

new_vector[i + 1] = mismatch

elsif insertion?(mismatch, insertion, deletion)

insertion.insertions += 1

new_vector[i + 1] = insertion

end

end

end

@vector = new_vector

end

# Method to determine if a match occurred.

def match?(char1, char2)

(EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0

end

# Method to determine if a mismatch occured.

def mismatch?(mismatch, insertion, deletion)

if mismatch.edit_distance <= insertion.edit_distance and

mismatch.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if an insertion occured.

def insertion?(mismatch, insertion, deletion)

if insertion.edit_distance <= mismatch.edit_distance and

insertion.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if a deletion occured.

def deletion?(mismatch, insertion, deletion)

if deletion.edit_distance <= mismatch.edit_distance and

deletion.edit_distance <= insertion.edit_distance

true

end

end

# Method to print the score vector.

def vector_print

@vector.each do |s|

puts s

end

puts

end

# Method that returns a Match object initialized with

# information from the score vector.

def match_new

matches = @vector.last.matches

mismatches = @vector.last.mismatches

insertions = @vector.last.insertions

deletions = @vector.last.deletions

length = @pattern.length - insertions + deletions

pos = @pos - length + 1

match = @seq[pos ... pos + length]

Match.new(pos, match, matches, mismatches, insertions, deletions,

length)

end

# Method that determines if a match was found by analyzing the score

vector.

def match_found?

if @vector.last.edit_distance <= @max_edit_distance

true

end

end

# Class to instantiate Score objects that holds score information.

class Score

attr_accessor :matches, :mismatches, :insertions, :deletions

def initialize(matches = 0, mismatches = 0, insertions = 0,

deletions = 0)

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

end

# Method to calculate and return the edit distance.

def edit_distance

self.mismatches + self.insertions + self.deletions

end

private

def to_s

"(#{[self.matches, self.mismatches, self.insertions,

self.deletions].join(',')})"

end

end

# Class for creating Match objects which contain the description of a

# match between a nucleotide sequence and a pattern.

class Match

attr_reader os, :match, :matches, :mismatches, :insertions,

:deletions, :edit_distance, :length

def initialize(pos, match, matches, mismatches, insertions,

deletions, length)

@pos = pos

@match = match

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

@edit_distance = mismatches + insertions + deletions

@length = length

end

end

end