Source code for indicnlp.syllable.syllabifier

# 
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#  
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
# 

import re

from indicnlp.script import indic_scripts as si

## Malayalam chillu characters mapped to their base consonants
chillu_char_map = {
    '\u0d7a': '\u0d23',
    '\u0d7b': '\u0d28',
    '\u0d7c': '\u0d30',
    '\u0d7d': '\u0d32',
    '\u0d7e': '\u0d33',
    '\u0d7f': '\u0d15',
}

char_chillu_map = {v: k for k, v in chillu_char_map.items()}

def normalize_malayalam(word):
    """Rewrite chillu characters as consonant + halant; return the
    normalized word and a parallel digit mask recording the edits."""

    word_mask = re.sub(r'[0-9]', '0', word)

    ## instead of chillu characters, use consonant+halant
    for chillu, char in chillu_char_map.items():
        word = word.replace(chillu, '{}\u0d4d'.format(char))
        word_mask = word_mask.replace(chillu, '41')

    word_mask = re.sub(r'[^0-9]', '0', word_mask)

    return word, word_mask
def denormalize_malayalam(word, word_mask):
    """Restore chillu characters at the positions marked '4' in the mask."""

    word = list(word)
    word_mask = list(word_mask)

    ## pattern 4: replace consonant+halant with the original chillu
    idx = 0
    while idx >= 0:
        try:
            idx = word_mask.index('4', idx)
            word[idx:idx+2] = char_chillu_map[word[idx]]
            word_mask[idx:idx+2] = '0'
        except ValueError:
            break

    return ''.join(word)
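## Illustrative round trip (assumed example, checkable by hand): the chillu
## '\u0d7b' maps to '\u0d28' + virama, and the mask marks the edit with '41'
## so that it can later be undone:
##
##   normalize_malayalam('\u0d05\u0d7b')
##   # -> ('\u0d05\u0d28\u0d4d', '041')
##   denormalize_malayalam('\u0d05\u0d28\u0d4d', '041')
##   # -> '\u0d05\u0d7b'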
def normalize_punjabi(word):
    """Rewrite tippi as anusvaar and addak+consonant as
    consonant+halant+consonant; return the word and a digit mask."""

    word_mask = re.sub(r'[0-9]', '0', word)

    ## replace tippi with anusvaar
    word = word.replace('\u0a70', '\u0a02')
    word_mask = word_mask.replace('\u0a70', '2')

    ## replace addak+consonant with consonant+halant+consonant
    word = re.sub(r'\u0a71(.)', '\\1\u0a4d\\1', word)
    word_mask = re.sub(r'\u0a71(.)', '311', word_mask)

    word_mask = re.sub(r'[^0-9]', '0', word_mask)

    return word, word_mask
def denormalize_punjabi(word, word_mask):
    """Restore tippi ('2' in the mask) and addak ('3' in the mask)."""

    word = list(word)
    word_mask = list(word_mask)

    ## pattern 2: restore tippi
    idx = 0
    while idx >= 0:
        try:
            idx = word_mask.index('2', idx)
            word[idx] = '\u0a70'
            word_mask[idx] = '0'
        except ValueError:
            break

    ## pattern 3: collapse consonant+halant+consonant back to addak+consonant
    idx = 0
    while idx >= 0:
        try:
            idx = word_mask.index('3', idx)
            word[idx:idx+3] = '\u0a71{}'.format(word[idx])
            word_mask[idx:idx+3] = '00'
        except ValueError:
            break

    return ''.join(word)
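## Illustrative round trip (assumed example): the addak in 'ਸੱਚ' is expanded
## to consonant+halant+consonant, with '311' recorded in the mask:
##
##   normalize_punjabi('\u0a38\u0a71\u0a1a')
##   # -> ('\u0a38\u0a1a\u0a4d\u0a1a', '0311')
##   denormalize_punjabi('\u0a38\u0a1a\u0a4d\u0a1a', '0311')
##   # -> '\u0a38\u0a71\u0a1a'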
def char_backoff(syllables_list, vocab):
    """Keep syllables found in vocab; back off to single characters for
    syllables that are not (no-op if vocab is None)."""
    syllables_final = []

    if vocab is None:
        syllables_final = syllables_list
    else:
        for s in syllables_list:
            if s in vocab:
                syllables_final.append(s)
            else:
                syllables_final.extend(s)

    return syllables_final
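## Illustrative example (assumed vocab): syllables absent from the vocab
## fall back to single characters, preserving order:
##
##   char_backoff(['ab', 'cd'], {'cd'})
##   # -> ['a', 'b', 'cd']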
def orthographic_syllabify_improved(word, lang, vocab=None):
    """Orthographic syllabification with better handling of anusvaar,
    plus mask-tracked normalization for Malayalam and Punjabi."""

    word_mask = ['0'] * len(word)

    if lang == 'ml':
        word, word_mask = normalize_malayalam(word)
    elif lang == 'pa':
        word, word_mask = normalize_punjabi(word)

    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]

    syllables = []
    syllables_mask = []

    for i in range(len(word)):
        v = p_vectors[i]

        syllables.append(word[i])
        syllables_mask.append(word_mask[i])

        ## simplified syllabification: see orthographic_simple_syllabify

        ## better syllabification
        if i + 1 < len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')
            syllables_mask.append('0')

        elif not si.is_valid(v) or si.is_misc(v):
            syllables.append(' ')
            syllables_mask.append('0')

        elif si.is_vowel(v):
            anu_nonplos = (i + 2 < len(word) and
                           si.is_anusvaar(p_vectors[i+1]) and
                           not si.is_plosive(p_vectors[i+2]))
            anu_eow = (i + 2 == len(word) and
                       si.is_anusvaar(p_vectors[i+1]))

            if not (anu_nonplos or anu_eow):
                syllables.append(' ')
                syllables_mask.append('0')

        elif i + 1 < len(word) and (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i+1]):
                syllables.append(' ')
                syllables_mask.append('0')
            elif si.is_vowel(p_vectors[i+1]) and not si.is_dependent_vowel(p_vectors[i+1]):
                syllables.append(' ')
                syllables_mask.append('0')
            elif si.is_anusvaar(p_vectors[i+1]):
                anu_nonplos = (i + 2 < len(word) and
                               not si.is_plosive(p_vectors[i+2]))
                anu_eow = i + 2 == len(word)

                if not (anu_nonplos or anu_eow):
                    syllables.append(' ')
                    syllables_mask.append('0')

    syllables_mask = ''.join(syllables_mask)
    syllables = ''.join(syllables)

    ## invariant: mask and syllable string stay aligned, and no
    ## normalization marker should directly follow a syllable boundary
    if syllables_mask.find('01') >= 0:
        print('Warning: inconsistent syllable mask')

    if lang == 'ml':
        syllables = denormalize_malayalam(syllables, syllables_mask)
    elif lang == 'pa':
        syllables = denormalize_punjabi(syllables, syllables_mask)

    syllables_list = syllables.strip().split(' ')
    return char_backoff(syllables_list, vocab)
def orthographic_syllabify(word, lang, vocab=None):
    """Orthographic syllabification with better handling of anusvaar."""

    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]

    syllables = []

    for i in range(len(word)):
        v = p_vectors[i]

        syllables.append(word[i])

        ## simplified syllabification: see orthographic_simple_syllabify

        ## better syllabification
        if i + 1 < len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')

        elif not si.is_valid(v) or si.is_misc(v):
            syllables.append(' ')

        elif si.is_vowel(v):
            anu_nonplos = (i + 2 < len(word) and
                           si.is_anusvaar(p_vectors[i+1]) and
                           not si.is_plosive(p_vectors[i+2]))
            anu_eow = (i + 2 == len(word) and
                       si.is_anusvaar(p_vectors[i+1]))

            if not (anu_nonplos or anu_eow):
                syllables.append(' ')

        elif i + 1 < len(word) and (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i+1]):
                syllables.append(' ')
            elif si.is_vowel(p_vectors[i+1]) and not si.is_dependent_vowel(p_vectors[i+1]):
                syllables.append(' ')
            elif si.is_anusvaar(p_vectors[i+1]):
                anu_nonplos = (i + 2 < len(word) and
                               not si.is_plosive(p_vectors[i+2]))
                anu_eow = i + 2 == len(word)

                if not (anu_nonplos or anu_eow):
                    syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return char_backoff(syllables_list, vocab)
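## Usage sketch (assumes the Indic NLP resources are installed and loaded,
## e.g. via INDIC_RESOURCES_PATH, since get_phonetic_feature_vector needs
## the script's phonetic data; the resource path below is hypothetical):
##
##   from indicnlp import common, loader
##   common.set_resources_path('/path/to/indic_nlp_resources')
##   loader.load()
##   orthographic_syllabify('\u091c\u0917\u0926\u0940\u0936', 'hi')   # जगदीश
##   # expected: ['ज', 'ग', 'दी', 'श']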
def orthographic_simple_syllabify(word, lang, vocab=None):
    """Simplified orthographic syllabification: split after vowels and
    between consonant clusters, without special anusvaar handling."""

    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]

    syllables = []

    for i in range(len(word)):
        v = p_vectors[i]

        syllables.append(word[i])

        ## simplified syllabification
        if i + 1 < len(word) and \
                (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')
        elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
            syllables.append(' ')
        elif i + 1 < len(word) and \
                (si.is_consonant(v) or si.is_nukta(v)) and \
                (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
            syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return char_backoff(syllables_list, vocab)
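## For words without anusvaar the simplified rules should agree with
## orthographic_syllabify (assumed example, resources loaded as above):
##
##   orthographic_simple_syllabify('\u091c\u0917\u0926\u0940\u0936', 'hi')
##   # expected: ['ज', 'ग', 'दी', 'श']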