Source code for pydna.py_rstr_max.rstr_max

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from tools_karkkainen_sanders import direct_kark_sort
from array import array

class Rstr_max:
    """Find repeated substrings shared inside a collection of strings.

    Typical use: call :meth:`add_str` once per input string, then
    :meth:`go`.  The strings are joined into one text (``global_suffix``)
    with ``chr(2)`` as separator, a suffix array and an LCP array are built
    over it, and a stack sweep over the LCP array collects the repeats.

    NOTE(review): ``chr(2)`` is used as the frontier between strings, so the
    input strings are presumably expected not to contain that character.
    """

    def __init__(self):
        # Input strings, in insertion order.
        self.array_str = []

    def add_str(self, str_unicode):
        """Register one more string to be analysed by :meth:`go`."""
        self.array_str.append(str_unicode)

    def step1_sort_suffix(self):
        """Build the concatenated text, its index tables and its suffix array.

        Sets the following attributes:

        * ``global_suffix``: all strings joined with ``chr(2)``;
        * ``idxString[k]``: index (in ``array_str``) of the string owning
          character ``k`` of ``global_suffix``;
        * ``idxPos[k]``: position of character ``k`` inside its own string;
        * ``endAt[k]``: offset in ``global_suffix`` just past the end of the
          string owning character ``k``;
        * ``res``: the suffix array returned by ``direct_kark_sort``.

        Separator positions keep the value ``-1`` in the three index tables.
        """
        char_frontier = chr(2)
        self.global_suffix = char_frontier.join(self.array_str)
        nb_chars = len(self.global_suffix)

        unset = [-1] * nb_chars
        self.idxString = array('i', unset)
        self.idxPos = array('i', unset)
        self.endAt = array('i', unset)

        k = 0
        for idx, mot in enumerate(self.array_str):
            last = k + len(mot)
            for p in range(len(mot)):
                self.idxString[k] = idx
                self.idxPos[k] = p
                self.endAt[k] = last
                k += 1
            # Step over the separator that follows this string.
            k += 1

        if nb_chars == 0:
            # Nothing to sort; the later steps already cope with an empty
            # suffix array, so do not depend on how the sorter treats ''.
            self.res = array('i')
        else:
            self.res = direct_kark_sort(self.global_suffix)

    def step2_lcp(self):
        """Compute ``self.lcp`` from ``self.res`` and ``self.endAt``.

        ``lcp[i]`` is the length of the common prefix of the suffixes at
        sorted positions ``i`` and ``i + 1``; the comparison never runs past
        the end of the string that owns each suffix (``endAt``).
        """
        suffix_array = self.res
        n = len(suffix_array)
        zeros = [0] * n
        rank = array('i', zeros)
        lcp = array('i', zeros)

        s = self.global_suffix
        end_at = self.endAt

        # Inverse of the suffix array.  Sorted positions below
        # len(array_str) are deliberately left at rank 0 (presumably because
        # suffixes starting with the separator sort first -- TODO confirm);
        # rank 0 means "no predecessor to compare with" below.
        for i in range(len(self.array_str), n):
            rank[suffix_array[i]] = i

        l = 0
        for j in range(n):
            # Moving from suffix j-1 to suffix j drops one leading character,
            # so at most one character of the previous match is lost.
            if l > 0:
                l -= 1
            i = rank[j]
            if i:
                j2 = suffix_array[i - 1]
                while (l + j < end_at[j] and l + j2 < end_at[j2]
                       and s[j + l] == s[j2 + l]):
                    l += 1
                lcp[i - 1] = l
            else:
                l = 0
        self.lcp = lcp

    def step3_rstr(self):
        """Sweep the LCP array and return the repeated substrings found.

        Returns a dict mapping ``(end_offset, nb_occurrences)`` to
        ``(length, start_index)`` where ``end_offset`` is an offset in
        ``global_suffix`` just past one occurrence, ``length`` is the length
        of the repeat and ``start_index`` is the position in the suffix array
        where the run of suffixes sharing that repeat begins.  The text of a
        repeat is ``global_suffix[end_offset - length:end_offset]``.
        """
        results = {}
        if len(self.res) == 0:
            return results

        class Stack:
            """Bare holder for the attributes ``removeMany`` works on."""

        stack = Stack()
        stack._top = 0       # current depth == sum of the pushed counts
        stack.lst_max = []   # entries are [count, start_index, max_end]

        suffix_array = self.res
        lcp = self.lcp
        len_lcp = len(lcp) - 1  # last LCP slot has no following suffix

        prev_len = 0
        pos1 = suffix_array[0]
        for idx in range(len_lcp):
            current_len = lcp[idx]
            pos2 = suffix_array[idx + 1]
            end_ = max(pos1, pos2) + current_len

            n = prev_len - current_len
            if n < 0:
                # Common prefix grew: open -n new levels starting here.
                stack.lst_max.append([-n, idx, end_])
                stack._top += -n
            elif n > 0:
                # Common prefix shrank: close n levels and report them.
                self.removeMany(stack, results, n, idx)
            elif stack._top > 0 and end_ > stack.lst_max[-1][-1]:
                # Same depth: only keep the right-most end offset seen.
                stack.lst_max[-1][-1] = end_

            prev_len = current_len
            pos1 = pos2

        if stack._top > 0:
            # Close whatever is still open at the end of the suffix array.
            self.removeMany(stack, results, stack._top, len_lcp)

        return results

    def removeMany(self, stack, results, m, idxEnd):
        """Pop ``m`` levels from ``stack`` and record them in ``results``.

        ``stack`` must expose ``_top`` (current depth) and ``lst_max`` (list
        of ``[count, start_index, max_end]`` entries).  ``idxEnd`` is the
        suffix-array index at which the popped levels end.  For each distinct
        ``start_index`` popped, ``results[(max_end, idxEnd - start_index + 1)]``
        is set to ``(depth, start_index)`` unless an entry with an equal or
        greater depth is already stored under that key.
        """
        prev_start = -1
        while m > 0:
            n, idx_start, max_end = stack.lst_max.pop()
            if prev_start != idx_start:
                id_ = (max_end, idxEnd - idx_start + 1)
                if id_ not in results or results[id_][0] < stack._top:
                    results[id_] = (stack._top, idx_start)
                prev_start = idx_start
            m -= n
            stack._top -= n
            if m < 0:
                # The popped entry held more levels than requested: push the
                # surplus (-m levels) back with its end offset adjusted.
                stack.lst_max.append([-m, idx_start, max_end - n - m])
                stack._top -= m

    def go(self):
        """Run the three steps in order and return ``step3_rstr()``'s dict."""
        self.step1_sort_suffix()
        self.step2_lcp()
        return self.step3_rstr()
def main():
    """Demo: print every repeat found in two copies of 'toto'.

    Each output line has the form ``[<repeat>] <number of occurrences>``.
    """
    sample = u'toto'
    rstr = Rstr_max()
    rstr.add_str(sample)
    rstr.add_str(sample)
    repeats = rstr.go()
    # Keys are (end offset in global_suffix, occurrence count) and values are
    # (length, start index in the suffix array); the end offset is global, so
    # the text must be sliced out of global_suffix, not out of array_str.
    for (offset_end, nb), (length, _start_plage) in repeats.items():
        ss = rstr.global_suffix[offset_end - length:offset_end]
        print('[%s] %d' % (ss, nb))


if __name__ == '__main__':
    main()