Source code for indicnlp.script.english_script

# 
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#  
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
# 

import pandas as pd
import numpy as np

from indicnlp import common
from indicnlp.common import IndicNlpException


#### Maps from ARPABET to Internal Id
ARPABET_ID_MAP={}
ID_ARPABET_MAP={}


###
# Phonetic Information about script characters 
###

""" Phonetic data for English """
ENGLISH_PHONETIC_DATA=None

""" Phonetic vector for English"""
ENGLISH_PHONETIC_VECTORS=None

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6

## PHONETIC PROPERTIES in order in which they occur in the vector 
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary 
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]

### 
# Bit vector ranges for various properties 
###

PV_PROP_RANGES={
        'basic_type': [0,6],
        'vowel_length': [6,8],
        'vowel_strength': [8,11],
        'vowel_status': [11,13],
        'consonant_type': [13,18],
        'articulation_place': [18,23],
        'aspiration': [23,25],
        'voicing': [25,27],
        'nasalization': [27,29],
        'vowel_horizontal': [29,32],
        'vowel_vertical': [32,36],
        'vowel_roundness': [36,38],
        }


####
# Indexes into the Phonetic Vector 
####
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
PVIDX_BT_S=PVIDX_BT_VOWEL 
PVIDX_BT_E=PVIDX_BT_MISC+1

PVIDX_VSTAT_DEP=12

####
SCRIPT_RANGE_START=0x0D00
## TBD
SCRIPT_RANGE_END=0x0D2E


[docs]def init():
    """
    To be called by library loader, do not call it in your program 
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA=pd.read_csv(common.get_resources_path()+'/script/english_script_phonetic_data.csv',encoding='utf-8')    

    ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values

    PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    with open(common.get_resources_path()+'/script/english_arpabet_list.csv','r',encoding='utf-8') as infile: 
        for ph_id, name in enumerate(iter(infile)): 
            name=name.strip()
            ARPABET_ID_MAP[name]=ph_id
            ID_ARPABET_MAP[ph_id]=name


[docs]def phoneme_to_offset(ph): 
    return ARPABET_ID_MAP[ph]

[docs]def offset_to_phoneme(ph_id): 
    return ID_ARPABET_MAP[ph_id]

[docs]def phoneme_to_enc(ph): 
    return chr(SCRIPT_RANGE_START+phoneme_to_offset(ph))

[docs]def enc_to_phoneme(ph): 
    return offset_to_phoneme(enc_to_offset(ph))

[docs]def enc_to_offset(c): 
    return ord(c)-SCRIPT_RANGE_START

[docs]def in_range(offset): 
    return offset>=SCRIPT_RANGE_START and offset<SCRIPT_RANGE_END 

[docs]def get_phonetic_info(lang): 
    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)

[docs]def invalid_vector():
    ##  TODO: check if np datatype is correct?
    return np.array([0]*PHONETIC_VECTOR_LENGTH)

[docs]def get_phonetic_feature_vector(p,lang):

    offset=enc_to_offset(p) 

    if not in_range(offset): 
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0: 
        return invalid_vector()

    return phonetic_vectors[offset]