Package pyarabic ::
Module araby
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """
19 Arabic module
20 @author: Taha Zerrouki
21 @contact: taha dot zerrouki at gmail dot com
22 @copyright: Arabtechies, Arabeyes, Taha Zerrouki
23 @license: GPL
24 @date:2010/03/01
25 @version: 0.1
26 """
27 import re
28 from stack import *
29
30 """
31 the arabic chars contains all arabic letters, a sub class of unicode,
32 """
33
# Single-character constants: punctuation, letters, combining hamza/madda
# marks, digits and a few special code points, each named after the Arabic
# character it holds.
COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
BEH = u'\u0628'
TEH_MARBUTA = u'\u0629'
TEH = u'\u062a'
THEH = u'\u062b'
JEEM = u'\u062c'
HAH = u'\u062d'
KHAH = u'\u062e'
DAL = u'\u062f'
THAL = u'\u0630'
REH = u'\u0631'
ZAIN = u'\u0632'
SEEN = u'\u0633'
SHEEN = u'\u0634'
SAD = u'\u0635'
DAD = u'\u0636'
TAH = u'\u0637'
ZAH = u'\u0638'
AIN = u'\u0639'
GHAIN = u'\u063a'
TATWEEL = u'\u0640'
FEH = u'\u0641'
QAF = u'\u0642'
KAF = u'\u0643'
LAM = u'\u0644'
MEEM = u'\u0645'
NOON = u'\u0646'
HEH = u'\u0647'
WAW = u'\u0648'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'
MADDA_ABOVE = u'\u0653'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'
ZERO = u'\u0660'
ONE = u'\u0661'
TWO = u'\u0662'
THREE = u'\u0663'
FOUR = u'\u0664'
FIVE = u'\u0665'
SIX = u'\u0666'
SEVEN = u'\u0667'
EIGHT = u'\u0668'
NINE = u'\u0669'
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'
MINI_ALEF = u'\u0670'
ALEF_WASLA = u'\u0671'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'

# Diacritic marks (members of TASHKEEL / HARAKAT below).
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'

# Small letters. SMALL_ALEF is the same code point as MINI_ALEF (U+0670).
SMALL_ALEF = u"\u0670"
SMALL_WAW = u"\u06E5"
SMALL_YEH = u"\u06E6"

# Lam-Alef ligatures stored as one code point each, followed by the
# equivalent two-code-point spellings (LAM followed by an alef form).
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'
simple_LAM_ALEF = u'\u0644\u0627'
simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'

# All letters as one unicode string. ALEF_MAKSURA is not part of it.
LETTERS = u''.join([
    ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
    DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
    AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
    HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW,
    YEH_HAMZA,
])

# TASHKEEL is HARAKAT plus SHADDA.
TASHKEEL = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
    SHADDA,
)
HARAKAT = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
)
SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)

TANWIN = (FATHATAN, DAMMATAN, KASRATAN)

# NOTE: the spelling "LIGUATURES" is part of the public interface; keep it.
LIGUATURES = (
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
)
HAMZAT = (
    HAMZA,
    WAW_HAMZA,
    YEH_HAMZA,
    HAMZA_ABOVE,
    HAMZA_BELOW,
    ALEF_HAMZA_BELOW,
    ALEF_HAMZA_ABOVE,
)
ALEFAT = (
    ALEF,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF_WASLA,
    ALEF_MAKSURA,
    SMALL_ALEF,
)
WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)

WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
TEHLIKE = (TEH, TEH_MARBUTA)

SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)
MOON = (
    HAMZA,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF,
    BEH,
    JEEM,
    HAH,
    KHAH,
    AIN,
    GHAIN,
    FEH,
    QAF,
    KAF,
    MEEM,
    HEH,
    WAW,
    YEH,
)
SUN = (
    TEH,
    THEH,
    DAL,
    THAL,
    REH,
    ZAIN,
    SEEN,
    SHEEN,
    SAD,
    DAD,
    TAH,
    ZAH,
    LAM,
    NOON,
)

# Letter -> rank. TEH_MARBUTA shares rank 3 with TEH, and every hamza form
# shares rank 29 with HAMZA.
AlphabeticOrder = {
    ALEF: 1,
    BEH: 2,
    TEH: 3,
    TEH_MARBUTA: 3,
    THEH: 4,
    JEEM: 5,
    HAH: 6,
    KHAH: 7,
    DAL: 8,
    THAL: 9,
    REH: 10,
    ZAIN: 11,
    SEEN: 12,
    SHEEN: 13,
    SAD: 14,
    DAD: 15,
    TAH: 16,
    ZAH: 17,
    AIN: 18,
    GHAIN: 19,
    FEH: 20,
    QAF: 21,
    KAF: 22,
    LAM: 23,
    MEEM: 24,
    NOON: 25,
    HEH: 26,
    WAW: 27,
    YEH: 28,
    HAMZA: 29,

    ALEF_MADDA: 29,
    ALEF_HAMZA_ABOVE: 29,
    WAW_HAMZA: 29,
    ALEF_HAMZA_BELOW: 29,
    YEH_HAMZA: 29,
}

# Character -> its name written in Arabic.
NAMES = {
    ALEF: u"ألف",
    BEH: u"باء",
    TEH: u'تاء',
    TEH_MARBUTA: u'تاء مربوطة',
    THEH: u'ثاء',
    JEEM: u'جيم',
    HAH: u'حاء',
    KHAH: u'خاء',
    DAL: u'دال',
    THAL: u'ذال',
    REH: u'راء',
    ZAIN: u'زاي',
    SEEN: u'سين',
    SHEEN: u'شين',
    SAD: u'صاد',
    DAD: u'ضاد',
    TAH: u'طاء',
    ZAH: u'ظاء',
    AIN: u'عين',
    GHAIN: u'غين',
    FEH: u'فاء',
    QAF: u'قاف',
    KAF: u'كاف',
    LAM: u'لام',
    MEEM: u'ميم',
    NOON: u'نون',
    HEH: u'هاء',
    WAW: u'واو',
    YEH: u'ياء',
    HAMZA: u'همزة',

    TATWEEL: u'تطويل',
    ALEF_MADDA: u'ألف ممدودة',
    ALEF_MAKSURA: u'ألف مقصورة',
    ALEF_HAMZA_ABOVE: u'همزة على الألف',
    WAW_HAMZA: u'همزة على الواو',
    ALEF_HAMZA_BELOW: u'همزة تحت الألف',
    YEH_HAMZA: u'همزة على الياء',
    FATHATAN: u'فتحتان',
    DAMMATAN: u'ضمتان',
    KASRATAN: u'كسرتان',
    FATHA: u'فتحة',
    DAMMA: u'ضمة',
    KASRA: u'كسرة',
    SHADDA: u'شدة',
    SUKUN: u'سكون',
}

# One character class per group, compiled once at import time.
# The brackets are plain u"..." literals: the former ur"..." prefix is a
# syntax error on Python 3 and, with no backslash inside, denotes the same
# string on Python 2.
HARAKAT_pattern = re.compile(u"[" + u"".join(HARAKAT) + u"]")
TASHKEEL_pattern = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
HAMZAT_pattern = re.compile(u"[" + u"".join(HAMZAT) + u"]")
ALEFAT_pattern = re.compile(u"[" + u"".join(ALEFAT) + u"]")
LIGUATURES_pattern = re.compile(u"[" + u"".join(LIGUATURES) + u"]")
300
301
302
303
305 """Checks for Arabic Sukun Mark.
306 @param archar: arabic unicode char
307 @type archar: unicode
308 """
309 if archar==SUKUN:
310 return True;
311 else: return False;
312
314 """Checks for Arabic Shadda Mark.
315 @param archar: arabic unicode char
316 @type archar: unicode
317 """
318 if archar==SHADDA:
319 return True;
320 else: return False;
321
323 """Checks for Arabic Tatweel letter modifier.
324 @param archar: arabic unicode char
325 @type archar: unicode
326 """
327 if archar==TATWEEL:
328 return True;
329 else: return False;
331 """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
332 @param archar: arabic unicode char
333 @type archar: unicode
334 """
335 if archar in TANWIN:
336 return True;
337 else: return False;
338
340 """Checks for Arabic Tashkeel Marks (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN).
341 @param archar: arabic unicode char
342 @type archar: unicode
343 """
344 if archar in TASHKEEL:
345 return True;
346 else: return False;
347
def isHaraka(archar):
    """Checks for Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN).

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HARAKAT, False otherwise.
    @rtype: Boolean
    """
    # The membership test already yields the boolean result.
    return archar in HARAKAT
356
358 """Checks for Arabic short Harakat Marks (FATHA,DAMMA,KASRA,SUKUN).
359 @param archar: arabic unicode char
360 @type archar: unicode
361 """
362 if archar in SHORTHARAKAT:
363 return True;
364 else: return False;
365
367 """Checks for Arabic Ligatures like LamAlef.
368 (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
369 @param archar: arabic unicode char
370 @type archar: unicode
371 """
372 if archar in LIGUATURES:
373 return True;
374 else: return False;
375
377 """Checks for Arabic Hamza forms.
378 HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE )
379 @param archar: arabic unicode char
380 @type archar: unicode
381 """
382 if archar in HAMZAT:
383 return True;
384 else: return False;
385
387 """Checks for Arabic Alef forms.
388 ALEFAT=(ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,ALEF_WASLA, ALEF_MAKSURA );
389 @param archar: arabic unicode char
390 @type archar: unicode
391 """
392 if archar in ALEFAT:
393 return True;
394 else: return False;
395
397 """Checks for Arabic Yeh forms.
398 Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
399 @param archar: arabic unicode char
400 @type archar: unicode
401 """
402 if archar in YEHLIKE:
403 return True;
404 else: return False;
405
407 """Checks for Arabic Waw like forms.
408 Waw forms : WAW, WAW_HAMZA, SMALL_WAW
409 @param archar: arabic unicode char
410 @type archar: unicode
411 """
412 if archar in WAWLIKE:
413 return True;
414 else: return False;
415
417 """Checks for Arabic Teh forms.
418 Teh forms : TEH, TEH_MARBUTA
419 @param archar: arabic unicode char
420 @type archar: unicode
421 """
422 if archar in TEHLIKE:
423 return True;
424 else: return False;
426 """Checks for Arabic Small letters.
427 SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
428 @param archar: arabic unicode char
429 @type archar: unicode
430 """
431 if archar in SMALL:
432 return True;
433 else: return False;
434
436 """Checks for Arabic Weak letters.
437 Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
438 @param archar: arabic unicode char
439 @type archar: unicode
440 """
441 if archar in WEAK:
442 return True;
443 else: return False;
444
446 """Checks for Arabic Moon letters.
447 Moon Letters :
448 @param archar: arabic unicode char
449 @type archar: unicode
450 """
451
452 if archar in MOON:
453 return True;
454 else: return False;
455
457 """Checks for Arabic Sun letters.
458 Sun Letters :
459 @param archar: arabic unicode char
460 @type archar: unicode
461 """
462 if archar in SUN:
463 return True;
464 else: return False;
465
466
467
469 """return Arabic letter order between 1 and 29.
470 Alef order is 1, Yeh is 28, Hamza is 29.
471 Teh Marbuta has the same ordre with Teh, 3.
472 @param archar: arabic unicode char
473 @type archar: unicode
474 @return: arabic order.
475 @rtype: integer;
476 """
477 if AlphabeticOrder.has_key(archar):
478 return AlphabeticOrder[archar];
479 else: return 0;
480
482 """return Arabic letter name in arabic.
483 Alef order is 1, Yeh is 28, Hamza is 29.
484 Teh Marbuta has the same ordre with Teh, 3.
485 @param archar: arabic unicode char
486 @type archar: unicode
487 @return: arabic name.
488 @rtype: unicode;
489 """
490 if NAMES.has_key(archar):
491 return NAMES[archar];
492 else:
493 return u'';
494
496 """return a list of arabic characteres .
497 Return a list of characters from \u0600 to \u0652 inclusive.
498 @return: list of arabic characteres.
499 @rtype: unicode;
500 """
501 mylist=[];
502 for i in range(0x0600, 0x00653):
503 try :
504 mylist.append(unichr(i));
505 except ValueError:
506 pass;
507 return mylist;
508
509
510
511
512
514 """Checks if the arabic word contains shadda.
515 @param word: arabic unicode char
516 @type word: unicode
517 """
518 if re.search(SHADDA,word):
519 return True;
520 else:
521 return False;
522
523
524
525
527 """Checks if the arabic word is vocalized.
528 The word mustn't contain any spaces or punctuation.
529 @param word: arabic unicode char
530 @type word: unicode
531 """
532 if word.isalpha(): return False;
533
534 else:
535 if re.search(HARAKAT_pattern,word):
536 return True;
537 else:
538 return False;
540 """Checks if the arabic text is vocalized.
541 The text can contain many words and spaces
542 @param text: arabic unicode char
543 @type text: unicode
544 """
545 if re.search(HARAKAT_pattern,text):
546 return True;
547 else:
548 return False;
550 """ Checks for an Arabic standard Unicode block characters;
551 An arabic string can contain spaces, digits and pounctuation.
552 but only arabic standard characters, not extended arabic
553 @param text: input text
554 @type text: unicode
555 @return: True if all charaters are in Arabic block
556 @rtype: Boolean
557 """
558 if re.search(u"([^\u0600-\u0652%s%s%s\s\d])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE),text):
559 return False;
560 return True;
561
563 """ Checks for an Arabic Unicode block characters;
564 @param text: input text
565 @type text: unicode
566 @return: True if all charaters are in Arabic block
567 @rtype: Boolean
568 """
569 if re.search(u"([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])",text):
570 return False;
571 return True;
572
574 """ Checks for an valid Arabic word.
575 An Arabic word contains no spaces, digits or punctuation.
576 To catch some spelling errors, TEH_MARBUTA must be at the end of the word.
577 @param word: input word
578 @type word: unicode
579 @return: True if all charaters are in Arabic block
580 @rtype: Boolean
581 """
582 if len(word)==0 : return False;
583 elif re.search(u"([^\u0600-\u0652%s%s%s])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE),word):
584 return False;
585 elif isHaraka(word[0]) or word[0] in (WAW_HAMZA,YEH_HAMZA):
586 return False;
587
588 elif re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA,word):
589 return False;
590 elif re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%(TEH_MARBUTA,DAMMA,KASRA,FATHA),word):
591 return False;
592 else:
593 return True;
594
595
596
598 """
599 Return the first char
600 @param word: given word;
601 @type word: unicode;
602 @return: the first char
603 @rtype: unicode char;
604 """
605 return word[0];
607 """
608 Return the second char
609 @param word: given word;
610 @type word: unicode;
611 @return: the second char
612 @rtype: unicode char;
613 """
614 return word[1:2];
616 """
617 Return the last letter
618 example: zerrouki; 'i' is the last.
619 @param word: given word;
620 @type word: unicode;
621 @return: the last letter
622 @rtype: unicode char;
623 """
624 return word[-1:];
626 """
627 Return the second last letter
628 example: zerrouki; 'k' is the second last.
629 @param word: given word;
630 @type word: unicode;
631 @return: the second last letter
632 @rtype: unicode char;
633 """
634 return word[-2:-1]
635
636
637
def stripHarakat(text):
    """Strip Harakat from arabic word except Shadda.

    The stripped marks are the members of HARAKAT:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripHarakat(text)
        العربيّة

    @param text: arabic text.
    @type text: unicode.
    @return: the text with every HARAKAT mark removed.
    @rtype: unicode.
    """
    return HARAKAT_pattern.sub(u'', text)
655
def stripTashkeel(text):
    """Strip vowels from a text, including Shadda.

    The stripped marks are the members of TASHKEEL:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripTashkeel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with every TASHKEEL mark removed.
    @rtype: unicode.
    """
    unvocalized = TASHKEEL_pattern.sub('', text)
    return unvocalized
674
def stripTatweel(text):
    """
    Remove every TATWEEL character from a text and return the result.

    Example:
        >>> text=u"العـــــربية"
        >>> stripTatweel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text without TATWEEL.
    @rtype: unicode.
    """
    # TATWEEL is a single ordinary character, so it is a valid pattern as is.
    tatweel_re = re.compile(TATWEEL)
    return tatweel_re.sub('', text)
690
def normalizeLigature(text):
    """Replace Lam-Alef ligatures by the two letters LAM and ALEF.

    Some systems represent a Lam-Alef ligature as a single character; this
    function converts it into two letters. Every member of LIGUATURES
    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE) is replaced by the same pair, LAM followed by
    ALEF, so any hamza or madda carried by the ligature is not kept.

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    lam_alef_pair = LAM + ALEF
    return LIGUATURES_pattern.sub(lam_alef_pair, text)
def normalizeHamza(word):
    """Standardize the Hamzat into one form of hamza.

    ALEF_MADDA is expanded first, then every member of HAMZAT is replaced
    by the bare HAMZA.

    Example:
        >>> text=u"سئل أحد الأئمة"
        >>> normalizeHamza(text)
        سءل ءحد الءءمة

    @param word: arabic text.
    @type word: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    if word.startswith(ALEF_MADDA):
        # A leading ALEF_MADDA becomes HAMZA+ALEF when the next character is
        # not a haraka and the word is either exactly three characters long
        # or has SHADDA as its third character; otherwise HAMZA+HAMZA.
        if (len(word) >= 3 and word[1] not in HARAKAT
                and (word[2] == SHADDA or len(word) == 3)):
            word = HAMZA + ALEF + word[1:]
        else:
            word = HAMZA + HAMZA + word[1:]

    # Any ALEF_MADDA left inside the word is expanded to two hamzas.
    word = word.replace(ALEF_MADDA, HAMZA + HAMZA)
    return HAMZAT_pattern.sub(HAMZA, word)
733
734
735
737 """
738 separate the letters from the vowels, in arabic word,
739 if a letter hasn't a haraka, the not definited haraka is attributed.
740 return ( letters,vowels);
741 """
742
743 stack1=Stack(word)
744
745 stack1.items.reverse();
746 letters=Stack()
747 marks=Stack()
748 vowels=HARAKAT
749 last1=stack1.pop();
750
751
752
753 while last1 in vowels: last1=stack1.pop();
754 while last1!=None:
755 if last1 in vowels:
756
757
758 marks.pop();
759 marks.push(last1);
760 elif last1==SHADDA:
761
762
763
764 marks.pop();
765 marks.push(SUKUN);
766 marks.push(NOT_DEF_HARAKA);
767 letters.push(SHADDA);
768 else:
769 marks.push(NOT_DEF_HARAKA);
770 letters.push(last1);
771 last1=stack1.pop();
772 return (''.join(letters.items),''.join(marks.items))
773
774
def joint(letters, marks):
    """
    Join the letters with their marks into one word.

    letters and marks are read pairwise, in order. Each letter is written
    out followed by its mark, except that a mark equal to NOT_DEF_HARAKA is
    skipped. When the letter is SHADDA, a HARAKAT mark written just before
    it is dropped first, so the shadda directly follows the previous letter.

    @param letters: the letters of the word; may contain SHADDA.
    @type letters: unicode
    @param marks: one mark per letter; must be as long as letters.
    @type marks: unicode
    @return: the joined word, or "" when the two lengths differ.
    @rtype: unicode
    """
    # NOTE(review): NOT_DEF_HARAKA is not defined in the visible part of this
    # module -- confirm where it comes from.
    if len(letters) != len(marks):
        return ""

    pieces = []
    for letter, mark in zip(letters, marks):
        # Guarding on a non-empty list keeps a leading SHADDA from breaking
        # the final join.
        if letter == SHADDA and pieces and pieces[-1] in HARAKAT:
            pieces.pop()
        pieces.append(letter)
        if mark != NOT_DEF_HARAKA:
            pieces.append(mark)
    return ''.join(pieces)
815
817 """
818 If the two words have the same letters and the same harakat, this function returns True.
819 The two words can be full vocalized, or partial vocalized
820 """
821 debug=False;
822 stack1=Stack(word1)
823 stack2=Stack(word2)
824 last1=stack1.pop();
825 last2=stack2.pop();
826 if debug: print "+0", stack1, stack2;
827 vowels=HARAKAT
828 while last1!=None and last2!=None:
829 if last1==last2:
830 if debug: print "+2", stack1.items,last1, stack2.items,last2
831 last1=stack1.pop();
832 last2=stack2.pop();
833 elif last1 in vowels and last2 not in vowels:
834 if debug: print "+2", stack1.items,last1, stack2.items,last2
835 last1=stack1.pop();
836 elif last1 not in vowels and last2 in vowels:
837 if debug: print "+2", stack1.items,last1, stack2.items,last2
838 last2=stack2.pop();
839 else:
840 if debug: print "+2", stack1.items,last1, stack2.items,last2
841 break;
842 if not (stack1.isEmpty() and stack2.isEmpty()):
843 return False;
844 else: return True;
845
846
847
849 """
850 if the word1 is like a wazn (pattern),
851 the letters must be equal,
852 the wazn has FEH, AIN, LAM letters.
853 this are as generic letters.
854 The two words can be full vocalized, or partial vocalized
855 """
856 debug=False;
857 stack1=Stack(word1)
858 stack2=Stack(wazn)
859 root=Stack()
860 last1=stack1.pop();
861 last2=stack2.pop();
862 if debug: print "+0", stack1, stack2;
863 vowels=HARAKAT
864 while last1!=None and last2!=None:
865 if last1==last2 and last2 not in (FEH, AIN,LAM):
866 if debug: print "+2", stack1.items,last1, stack2.items,last2
867 last1=stack1.pop();
868 last2=stack2.pop();
869 elif last1 not in vowels and last2 in (FEH, AIN,LAM):
870 if debug: print "+2", stack1.items,last1, stack2.items,last2
871 root.push(last1);
872 print "t";
873 last1=stack1.pop();
874 last2=stack2.pop();
875 elif last1 in vowels and last2 not in vowels:
876 if debug: print "+2", stack1.items,last1, stack2.items,last2
877 last1=stack1.pop();
878 elif last1 not in vowels and last2 in vowels:
879 if debug: print "+2", stack1.items,last1, stack2.items,last2
880 last2=stack2.pop();
881 else:
882 if debug: print "+2", stack1.items,last1, stack2.items,last2
883 break;
884
885 root.items.reverse();
886 print " the root is ", root.items
887 if not (stack1.isEmpty() and stack2.isEmpty()):
888 return False;
889 else: return True;
890
892 """
893 If the two words have the same letters and the same harakat, this function returns True.
894 The first word is partially vocalized, the second is fully vocalized.
895 If the partially vocalized word contains a shadda, it must be at the same place in the fully vocalized one.
896 """
897 debug=False;
898 partial=stripHarakat(partial);
899 fully=stripHarakat(fully)
900 Pstack=Stack(partial)
901 Vstack=Stack(fully)
902 Plast=Pstack.pop();
903 Vlast=Vstack.pop();
904 if debug: print "+0", Pstack, Vstack;
905 vowels=SHADDA
906 while Plast!=None and Vlast!=None:
907 if Plast==Vlast:
908 if debug: print "+2", Pstack.items,Plast, Vstack.items,Vlast
909 Plast=Pstack.pop();
910 Vlast=Vstack.pop();
911 elif Plast ==SHADDA and Vlast !=SHADDA:
912 if debug: print "+2", Pstack.items,Plast, Vstack.items,Vlast
913 break;
914 elif Plast !=SHADDA and Vlast ==SHADDA:
915 if debug: print "+2", Pstack.items,Plast, Vstack.items,Vlast
916 Vlast=Vstack.pop();
917 else:
918 if debug: print "+2", Pstack.items,Plast, Vstack.items,Vlast
919 break;
920 if not (Pstack.isEmpty() and Vstack.isEmpty()):
921 return False;
922 else: return True;
924 """
925 Reduce the Tashkeel, by deleting evident cases.
926 @param text: the input text fully vocalized.
927 @type text: unicode.
928 @return : partially vocalized text.
929 @rtype: unicode.
930 """
931 reduced=text;
932
933
934 reduced=re.sub(u"(?<!(%s|%s))(%s|%s)"%( WAW, YEH, SUKUN, FATHA),'',reduced);
935
936
937 reduced=re.sub(u"%s(?=%s)"%(DAMMA, WAW),'',reduced);
938
939
940 reduced=re.sub(u"%s(?=%s)"%(KASRA, YEH),'',reduced);
941
942
943 reduced=re.sub(u"%s(?=%s)"%(FATHA, ALEF),'',reduced);
944
945
946 reduced=re.sub(u"(?<=\s(%s|%s))%s"%(WAW, YEH, FATHA),'',reduced);
947 reduced=re.sub(u"(?<=\A(%s|%s))%s"%(WAW, YEH, FATHA),'',reduced);
948
949
950 reduced=re.sub(u"(?<=%s)%s"%(ALEF_HAMZA_BELOW,KASRA),'',reduced);
951
952 return reduced;
953