Question about Tashaphyne package in python

Discussion in 'Python' started by yomnasalah91@gmail.com, Mar 3, 2013.

  1. Guest

    I have Python code that takes an Arabic word, gets its root and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".

    This is my code:

    # -*- coding=utf-8 -*-

    import re
    from arabic_const import *
    import Tashaphyne
    from Tashaphyne import *
    import enum
    from enum import Enum
    # Kinds of lookup understood by normalize_text(); built with the
    # positional-name Enum(...) factory imported from the `enum` package above.
    search_type = Enum('unvoc_word', 'voc_word', 'root_word')

    # Character classes assembled from the arabic_const letter constants.
    # Plain u"" literals are used instead of ur"": no backslashes are involved,
    # so the pattern text is identical, and u"" is not a Python-2-only syntax.

    # Short-vowel marks, tanween, sukun and shadda.
    HARAKAT_pat = re.compile(
        u"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA,
                         SUKUN, SHADDA]) + u"]")
    # Hamza forms carried on WAW / YEH.
    HAMZAT_pat = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")
    # Alef variants and standalone hamza marks.
    ALEFAT_pat = re.compile(
        u"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
                         HAMZA_ABOVE, HAMZA_BELOW]) + u"]")
    # Lam-alef ligature characters.
    LAMALEFAT_pat = re.compile(
        u"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
                         LAM_ALEF_MADDA_ABOVE]) + u"]")
    #--------------------------------------
    def strip_tashkeel(w):
        """Return *w* with every character matched by HARAKAT_pat removed.

        @param w: the word to clean.
        @return: the word without the marks listed in HARAKAT_pat.
        """
        # Unicode replacement keeps the result type consistent with the
        # unicode pattern.
        return HARAKAT_pat.sub(u'', w)

    #strip tatweel from a word and return a result word
    #--------------------------------------
    def strip_tatweel(w):
        """Return *w* with every TATWEEL character removed.

        @param w: the word to clean.
        @return: the word without TATWEEL.
        """
        # u'' instead of ur'': same pattern text, valid on Python 2 and 3.
        return re.sub(u'[%s]' % TATWEEL, u'', w)


    #--------------------------------------
    def normalize_hamza(w):
        """Unify hamza spellings in *w*.

        Characters matched by ALEFAT_pat are replaced by ALEF, then characters
        matched by HAMZAT_pat are replaced by HAMZA.

        @param w: the word to normalize.
        @return: the normalized word.
        """
        alef_unified = ALEFAT_pat.sub(ALEF, w)
        return HAMZAT_pat.sub(HAMZA, alef_unified)

    #--------------------------------------
    def normalize_lamalef(w):
        """Expand lam-alef ligatures in *w*.

        Every character matched by LAMALEFAT_pat is replaced by the two-letter
        sequence LAM + ALEF.

        @param w: the word to normalize.
        @return: the word with ligatures expanded.
        """
        return LAMALEFAT_pat.sub(u'%s%s' % (LAM, ALEF), w)

    #--------------------------------------
    def normalize_spellerrors(w):
        """Fold two frequently confused word-final spellings in *w*.

        TEH_MARBUTA is replaced by HEH, then ALEF_MAKSURA is replaced by YEH.

        @param w: the word to normalize.
        @return: the normalized word.
        """
        # u'' instead of ur'': same pattern text, valid on Python 2 and 3.
        w = re.sub(u'[%s]' % TEH_MARBUTA, HEH, w)
        return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, w)

    def guess_stem(self, word):
        """
        Guess affixed letters from forbidden letter adjacencies.

        The tables below list, for a candidate affix letter (or letter pair),
        the neighbouring letters that are treated as impossible next to it
        inside a root.  When such a pair is found at the start or the end of
        the word, the outer letter(s) are taken to be an affix and a '-' is
        inserted at that boundary.

        @param self: unused; kept so the existing call signature still works.
        @param word: the word.
        @type word: unicode.
        @return: word with a '-' inserted at each guessed affix boundary.
        @rtype: unicode
        """
        # Work on the bare letters: strip harakat and shadda first.
        word = ar_strip_marks(word)

        # --- one-letter prefixes -------------------------------------------
        prefixes_letters = (TEH, MEEM, LAM, WAW, BEH, KAF, FEH, HAMZA, YEH,
                            NOON)
        prefixes_forbiden = {
            ALEF_HAMZA_ABOVE: (ALEF_HAMZA_ABOVE, ZAH, AIN, GHAIN),
            BEH: (BEH, FEH, MEEM),
            TEH: (THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH),
            FEH: (BEH, FEH, MEEM),
            KAF: (JEEM, DAD, TAH, ZAH, QAF, KAF),
            LAM: (REH, SHEEN, LAM, NOON),
            MEEM: (BEH, FEH, MEEM),
            NOON: (REH, LAM, NOON),
            WAW: (WAW, YEH),
            YEH: (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH,
                  ZAH, GHAIN, KAF, HEH, YEH),
        }

        word_guess = word
        if len(word) >= 2:
            first, second = word[0], word[1]
            # .get() replaces dict.has_key(), which no longer exists on
            # Python 3; a letter without an entry simply forbids nothing.
            if second in prefixes_forbiden.get(first, ()):
                word_guess = u"%s-%s" % (first, word[1:])
                # Look for a second prefix letter right after the inserted '-'.
                if len(word_guess) >= 4:
                    first, second = word_guess[2], word_guess[3]
                    # .get() here as well: HAMZA is in prefixes_letters but
                    # has no prefixes_forbiden entry, so direct indexing
                    # raised KeyError.
                    if (first in prefixes_letters
                            and second in prefixes_forbiden.get(first, ())):
                        # NOTE(review): this drops the first prefix and
                        # repeats `first` (word_guess[2:] already starts with
                        # it).  Behaviour preserved as written -- confirm the
                        # intended output format before changing it.
                        word_guess = u"%s-%s" % (first, word_guess[2:])

        # --- two-letter suffixes -------------------------------------------
        bisuffixes_letters = (KAF + MEEM, KAF + NOON, HEH + MEEM, HEH + NOON)
        bisuffixes_forbiden = {
            HEH + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                         THEH, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH,
                         YEH),
            KAF + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                         THEH, JEEM, KHAH, ZAIN, SEEN, SHEEN, DAD, TAH, ZAH,
                         GHAIN, FEH, QAF, KAF, LAM, NOON, HEH, YEH),
            HEH + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                         THEH, JEEM, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN,
                         HEH, YEH),
            KAF + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                         THEH, JEEM, HAH, KHAH, THAL, SHEEN, DAD, TAH, ZAH,
                         AIN, GHAIN, QAF, KAF, NOON, HEH, YEH),
        }
        # Continue from whatever the prefix step produced.
        word = word_guess
        if len(word) >= 3:
            last_two = word[-2:]
            before_last_two = word[-3:-2]
            if (last_two in bisuffixes_letters
                    and before_last_two in bisuffixes_forbiden[last_two]):
                word_guess = u"%s-%s" % (word[:-2], last_two)

        # --- one-letter suffixes -------------------------------------------
        suffixes_letters = (KAF, TEH, HEH)
        suffixes_forbiden = {
            TEH: (THEH, JEEM, DAL, THAL, ZAIN, SHEEN, TAH, ZAH),
            KAF: (THEH, JEEM, KHAH, THAL, TAH, ZAH, GHAIN, QAF),
            HEH: (TEH, HAH, KHAH, DAL, REH, SEEN, SHEEN, SAD, ZAH, AIN, GHAIN),
        }
        word = word_guess
        # Slices (not indexes) so that empty or one-letter words do not raise.
        last = word[-1:]
        before_last = word[-2:-1]
        if last in suffixes_letters and before_last in suffixes_forbiden[last]:
            word_guess = u"%s-%s" % (word[:-1], last)

        return word_guess


    def normalize_text(word, searchtype):
        """Normalize *word* step by step, printing each intermediate result.

        The steps run in this order: strip_tashkeel, strip_tatweel,
        normalize_lamalef, normalize_hamza, normalize_spellerrors.  When
        *searchtype* equals search_type.root_word.index the result is
        additionally passed through guess_stem().

        @param word: the word to normalize.
        @param searchtype: compared against search_type.root_word.index.
        @return: the normalized (and, for root searches, stem-marked) word.
        """
        steps = (strip_tashkeel, strip_tatweel, normalize_lamalef,
                 normalize_hamza, normalize_spellerrors)
        for step in steps:
            word = step(word)
            # Trace output after every step; the parenthesised single-argument
            # form prints the same on Python 2 and 3.
            print(word)

        if searchtype == search_type.root_word.index:
            # Disabled alternative based on ArabicLightStemmer:
            #   ArListem=ArabicLightStemmer()
            #   stem=ArListem.lightStem(word)
            #   word=ArListem.get_stem()
            #   print word
            #   w=ArListem.get_prefix()
            #   print w
            #   word=ArListem.get_root()
            #
            # guess_stem is declared as (self, word) but lives at module
            # level, so the word goes in the second slot.  The previous call
            # passed the word as `self` and an undefined name `w` as the word,
            # which raised NameError.
            word = guess_stem(None, word)
            print(word)
        return word
     
    , Mar 3, 2013
    #1
    1. Advertising

  2. MRAB Guest

    On 2013-03-03 03:06, wrote:
    > I have Python code that takes an Arabic word, gets its root and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".
    >
    > This is my code:
    >
    > # -*- coding=utf-8 -*-
    >
    > import re
    > from arabic_const import *
    > import Tashaphyne
    > from Tashaphyne import *
    > import enum
    > from enum import Enum
    > search_type=Enum('unvoc_word','voc_word','root_word')
    >
    > HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")
    > HAMZAT_pat = re.compile(ur"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]");
    > ALEFAT_pat = re.compile(ur"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW]) + u"]");
    > LAMALEFAT_pat = re.compile(ur"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE]) + u"]");
    >

    [snip]
    When you're using Unicode with re in Python 2, you should include the
    re.UNICODE flag. For example:

    HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]", flags=re.UNICODE)

    or:

    HARAKAT_pat = re.compile(ur"(?u)[" + u"".join([FATHATAN, DAMMATAN,
    KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")

    I don't know whether that will make a difference in this case because I
    don't know Tashaphyne or Arabic.
     
    MRAB, Mar 3, 2013
    #2
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Parvinder
    Replies:
    6
    Views:
    750
    Thomas G. Marshall
    Feb 27, 2005
  2. Dave
    Replies:
    2
    Views:
    473
  3. George P
    Replies:
    3
    Views:
    687
    Alex Martelli
    Sep 11, 2004
  4. David Pratt
    Replies:
    4
    Views:
    336
    David Pratt
    May 13, 2006
  5. Klein Stéphane
    Replies:
    3
    Views:
    596
    Steve Holden
    Dec 20, 2009
Loading...

Share This Page