Question about Tashaphyne package in python

Y

yomnasalah91

I have Python code that takes an Arabic word, gets its root, and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".

This is my code:

# -*- coding=utf-8 -*-

import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum
# Search modes understood by normalize_text(), which compares its
# ``searchtype`` argument against ``search_type.root_word.index``.
search_type=Enum('unvoc_word','voc_word','root_word')

# Pre-compiled character classes used by the normalisation helpers below.
# Each pattern is a single [...] class built from arabic_const constants.
# Characters removed by strip_tashkeel().
HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")
# Characters that normalize_hamza() replaces with HAMZA.
HAMZAT_pat = re.compile(ur"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]");
# Characters that normalize_hamza() replaces with ALEF.
ALEFAT_pat = re.compile(ur"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW]) + u"]");
# Characters that normalize_lamalef() replaces with the two-letter sequence LAM + ALEF.
LAMALEFAT_pat = re.compile(ur"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE]) + u"]");
#--------------------------------------
def strip_tashkeel(w):
    """Return ``w`` with every character matched by HARAKAT_pat removed.

    HARAKAT_pat is the character class built from FATHATAN, DAMMATAN,
    KASRATAN, FATHA, DAMMA, KASRA, SUKUN and SHADDA.
    """
    stripped = HARAKAT_pat.sub('', w)
    return stripped

#strip tatweel from a word and return a result word
#--------------------------------------
def strip_tatweel(w):
    """Return ``w`` with every occurrence of TATWEEL removed."""
    # Same one-character class as before; the literal contains no
    # backslashes, so a plain unicode literal is equivalent to the raw one.
    tatweel_class = u'[%s]' % TATWEEL
    return re.sub(tatweel_class, '', w)


#--------------------------------------
def normalize_hamza(w):
    """Unify alef/hamza variants in ``w`` and return the result.

    Two substitutions are applied in order:
    1. every character matched by ALEFAT_pat becomes ALEF;
    2. every character matched by HAMZAT_pat becomes HAMZA.
    """
    alef_unified = ALEFAT_pat.sub(ALEF, w)
    hamza_unified = HAMZAT_pat.sub(HAMZA, alef_unified)
    return hamza_unified

#--------------------------------------
def normalize_lamalef(w):
    """Replace each character matched by LAMALEFAT_pat with LAM followed by ALEF.

    Returns the rewritten word; input without such characters comes back
    unchanged.
    """
    expanded = u'%s%s' % (LAM, ALEF)
    return LAMALEFAT_pat.sub(expanded, w)

#--------------------------------------
def normalize_spellerrors(w):
    """Fold two commonly interchanged letters in ``w`` and return the result.

    TEH_MARBUTA is rewritten to HEH first, then ALEF_MAKSURA is rewritten
    to YEH.
    """
    # (character to find, replacement), applied in this order.
    substitutions = ((TEH_MARBUTA, HEH), (ALEF_MAKSURA, YEH))
    for found, replacement in substitutions:
        w = re.sub(u'[%s]' % found, replacement, w)
    return w
def guess_stem(self,word):
    """
    Guess affix boundaries from letter pairs that cannot occur inside a root.

    In Arabic, some letters cannot be adjacent within a root, although the
    same sequences can appear where an affix meets the stem.  When such a
    pair is found at the start or at the end of the word, the outer
    letter(s) are assumed to be an affix and a '-' is inserted at the
    boundary.  Up to two prefix letters, one two-letter suffix and one
    one-letter suffix are examined, in that order.

    @param self: unused; kept so the existing signature stays unchanged.
    @param word: the word.
    @type word: unicode.
    @return: the word (after ar_strip_marks) with a '-' inserted at each
        guessed stemming position, or without any '-' if nothing matched.
    @rtype: unicode
    """
    # Work on the bare letters (the original note says this strips harakat
    # and shadda).
    word = ar_strip_marks(word)

    # --- prefixes -------------------------------------------------------
    prefixes_letters = (TEH, MEEM, LAM, WAW, BEH, KAF, FEH, HAMZA, YEH, NOON)
    # prefix letter -> letters that cannot follow it inside a root
    prefixes_forbiden = {
        ALEF_HAMZA_ABOVE: (ALEF_HAMZA_ABOVE, ZAH, AIN, GHAIN),
        BEH: (BEH, FEH, MEEM),
        TEH: (THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH),
        FEH: (BEH, FEH, MEEM),
        KAF: (JEEM, DAD, TAH, ZAH, QAF, KAF),
        LAM: (REH, SHEEN, LAM, NOON),
        MEEM: (BEH, FEH, MEEM),
        NOON: (REH, LAM, NOON),
        WAW: (WAW, YEH),
        YEH: (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
              GHAIN, KAF, HEH, YEH),
    }

    word_guess = word
    if len(word) >= 2:
        c1, c2 = word[0], word[1]
        if c2 in prefixes_forbiden.get(c1, ()):
            # One prefix letter detected.
            word_guess = u"%s-%s" % (c1, word[1:])
            if len(word) >= 3:
                # Look for a second prefix letter right after the first.
                c1, c2 = word[1], word[2]
                # .get() is required: HAMZA is listed in prefixes_letters
                # but has no entry in prefixes_forbiden, so plain indexing
                # raised KeyError for it.
                if c1 in prefixes_letters and c2 in prefixes_forbiden.get(c1, ()):
                    # Keep BOTH prefix letters before the marker.  The
                    # previous expression rebuilt the word from the second
                    # letter only, losing the first prefix letter and
                    # repeating the second one.
                    word_guess = u"%s-%s" % (word[:2], word[2:])

    # --- two-letter suffixes ---------------------------------------------
    bisuffixes_letters = (KAF+MEEM, KAF+NOON, HEH+MEEM, HEH+NOON)
    # suffix pair -> letters that cannot directly precede it inside a root
    bisuffixes_forbiden = {
        HEH+MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH,
                   HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
        KAF+MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH,
                   JEEM, KHAH, ZAIN, SEEN, SHEEN, DAD, TAH, ZAH, GHAIN, FEH,
                   QAF, KAF, LAM, NOON, HEH, YEH),
        HEH+NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH,
                   JEEM, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
        KAF+NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH,
                   JEEM, HAH, KHAH, THAL, SHEEN, DAD, TAH, ZAH, AIN, GHAIN,
                   QAF, KAF, NOON, HEH, YEH),
    }
    # Continue from the prefix-marked form so both markers end up in the result.
    word = word_guess
    if len(word) >= 3:
        bc_last = word[-2:]
        bc_blast = word[-3:-2]
        if bc_last in bisuffixes_letters and bc_blast in bisuffixes_forbiden[bc_last]:
            word_guess = u"%s-%s" % (word[:-2], bc_last)

    # --- one-letter suffixes ---------------------------------------------
    suffixes_letters = (KAF, TEH, HEH)
    # suffix letter -> letters that cannot directly precede it inside a root
    suffixes_forbiden = {
        TEH: (THEH, JEEM, DAL, THAL, ZAIN, SHEEN, TAH, ZAH),
        KAF: (THEH, JEEM, KHAH, THAL, TAH, ZAH, GHAIN, QAF),
        HEH: (TEH, HAH, KHAH, DAL, REH, SEEN, SHEEN, SAD, ZAH, AIN, GHAIN),
    }
    word = word_guess
    # Slices (not indexes) so an empty or one-letter word yields u'' here
    # instead of raising IndexError.
    c_last = word[-1:]
    c_blast = word[-2:-1]
    if c_last in suffixes_letters and c_blast in suffixes_forbiden[c_last]:
        word_guess = u"%s-%s" % (word[:-1], c_last)

    return word_guess


def normalize_text(word,searchtype):
    """Normalise an Arabic word and, for root searches, mark its affixes.

    The word goes through strip_tashkeel, strip_tatweel, normalize_lamalef,
    normalize_hamza and normalize_spellerrors, in that order.  The
    intermediate result is printed after every stage as a debugging aid.

    @param word: the word to normalise.
    @param searchtype: search mode; when it equals
        ``search_type.root_word.index`` the normalised word is additionally
        passed through guess_stem().
    @return: the normalised word, or the guess_stem() result for root
        searches.
    """
    stages = (
        strip_tashkeel,
        strip_tatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    for stage in stages:
        word = stage(word)
        # Single parenthesised argument: prints the same under the
        # Python 2 print statement and the Python 3 print function.
        print(word)

    if searchtype == search_type.root_word.index:
        # Alternative via Tashaphyne's light stemmer, kept for reference:
        #   ArListem=ArabicLightStemmer()
        #   stem=ArListem.lightStem(word)
        #   word=ArListem.get_stem()
        #   print word
        #   w=ArListem.get_prefix()
        #   print w
        #   word=ArListem.get_root()
        #
        # guess_stem() is declared as (self, word) and never reads ``self``,
        # so pass None for it.  The earlier call passed ``word`` as ``self``
        # and an undefined name ``w`` as the word, which raised NameError.
        word = guess_stem(None, word)
        print(word)
    return word
 
M

MRAB

I have Python code that takes an Arabic word, gets its root, and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".

This is my code:

# -*- coding=utf-8 -*-

import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum
search_type=Enum('unvoc_word','voc_word','root_word')

HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")
HAMZAT_pat = re.compile(ur"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]");
ALEFAT_pat = re.compile(ur"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW]) + u"]");
LAMALEFAT_pat = re.compile(ur"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE]) + u"]");
[snip]
When you're using Unicode with re in Python 2, you should include the
re.UNICODE flag. For example:

HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN,
FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]", flags=re.UNICODE)

or:

HARAKAT_pat = re.compile(ur"(?u)[" + u"".join([FATHATAN, DAMMATAN,
KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")

I don't know whether that will make a difference in this case because I
don't know Tashaphyne or Arabic.
 

Members online

Forum statistics

Threads
473,743
Messages
2,569,478
Members
44,898
Latest member
BlairH7607

Latest Threads

Top