Source code for indicnlp.transliterate.script_unifier

# 
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#  
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
# 

#Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts 
#
# @author Anoop Kunchukuttan 
#

import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader

[docs]class AggressiveScriptUnifier(): def __init__(self,common_lang='hi',nasals_mode='to_nasal_consonants'): self.common_lang=common_lang self.nasals_mode=nasals_mode self.do_normalize_chandras=True self.do_normalize_vowel_ending=True self.remove_nuktas=True self.normalizer_map={} self._init_normalizers() def _init_normalizers(self): normalizer_factory=indic_normalize.IndicNormalizerFactory() ## for languages with common parameters for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn']: self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, nasals_mode=self.nasals_mode, do_normalize_chandras=self.do_normalize_chandras, remove_nuktas=self.remove_nuktas, do_normalize_vowel_ending=self.do_normalize_vowel_ending) ## for languages with language specific parameters self.normalizer_map['pa']=normalizer_factory.get_normalizer('pa', nasals_mode=self.nasals_mode, do_normalize_chandras=self.do_normalize_chandras, remove_nuktas=self.remove_nuktas, do_normalize_vowel_ending=self.do_normalize_vowel_ending, do_canonicalize_addak=True, do_canonicalize_tippi=True, do_replace_vowel_bases=True) self.normalizer_map['or']=normalizer_factory.get_normalizer('or', nasals_mode=self.nasals_mode, do_normalize_chandras=self.do_normalize_chandras, remove_nuktas=self.remove_nuktas, do_normalize_vowel_ending=self.do_normalize_vowel_ending, do_remap_wa=True) self.normalizer_map['as']=normalizer_factory.get_normalizer('as', nasals_mode=self.nasals_mode, do_normalize_chandras=self.do_normalize_chandras, remove_nuktas=self.remove_nuktas, do_normalize_vowel_ending=self.do_normalize_vowel_ending, do_remap_assamese_chars=True) self.normalizer_map['ml']=normalizer_factory.get_normalizer('ml', nasals_mode=self.nasals_mode, do_normalize_chandras=self.do_normalize_chandras, remove_nuktas=self.remove_nuktas, do_normalize_vowel_ending=self.do_normalize_vowel_ending, do_canonicalize_chillus=True, do_correct_geminated_T=True)
[docs] def transform(self,text,lang): text=self.normalizer_map[lang].normalize(text) text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang) return text
[docs]class BasicScriptUnifier(): def __init__(self,common_lang='hi',nasals_mode='do_nothing'): self.common_lang=common_lang self.nasals_mode=nasals_mode self.normalizer_map={} self._init_normalizers() def _init_normalizers(self): normalizer_factory=indic_normalize.IndicNormalizerFactory() for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn','pa','or','as','ml']: self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, nasals_mode=self.nasals_mode)
[docs] def transform(self,text,lang): if lang in self.normalizer_map: text=self.normalizer_map[lang].normalize(text) text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang) return text
[docs]class NaiveScriptUnifier(): def __init__(self,common_lang='hi'): self.common_lang=common_lang
[docs] def transform(self,text,lang): text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang) return text
if __name__ == '__main__': loader.load() if len(sys.argv)<=4: print("Usage: python script_unifier <command> <infile> <outfile> <language>") sys.exit(1) if sys.argv[1]=='aggressive': language=sys.argv[4] unifier=AggressiveScriptUnifier(nasals_mode='to_nasal_consonants') with open(sys.argv[2],'r',encoding='utf-8') as ifile: with open(sys.argv[3],'w',encoding='utf-8') as ofile: for i, line in enumerate(ifile.readlines()): line=line.strip() transliterated_line=unifier.transform(line,language) ofile.write(transliterated_line+'\n') elif sys.argv[1]=='moderate': language=sys.argv[4] unifier=AggressiveScriptUnifier(nasals_mode='do_nothing') with open(sys.argv[2],'r',encoding='utf-8') as ifile: with open(sys.argv[3],'w',encoding='utf-8') as ofile: for i, line in enumerate(ifile.readlines()): line=line.strip() transliterated_line=unifier.transform(line,language) ofile.write(transliterated_line+'\n') elif sys.argv[1]=='basic': language=sys.argv[4] unifier=BasicScriptUnifier() with open(sys.argv[2],'r',encoding='utf-8') as ifile: with open(sys.argv[3],'w',encoding='utf-8') as ofile: for i, line in enumerate(ifile.readlines()): line=line.strip() transliterated_line=unifier.transform(line,language) ofile.write(transliterated_line+'\n') elif sys.argv[1]=='naive': language=sys.argv[4] unifier=NaiveScriptUnifier() with open(sys.argv[2],'r',encoding='utf-8') as ifile: with open(sys.argv[3],'w',encoding='utf-8') as ofile: for i, line in enumerate(ifile.readlines()): line=line.strip() transliterated_line=unifier.transform(line,language) ofile.write(transliterated_line+'\n')