Source code for KWIC.queryParser
import re
def tokenize(string):
    """Parse a query string for an ngram into token objects.

    Parameters
    ----------
    string : str
        Query string with each token enclosed in a pair of square
        brackets. In each token, the attributes ``word`` and ``pos``
        can be given, e.g. ``[word="他們" pos="N.*"]``. To search
        ``word`` with a regex, append ``.regex`` to ``word``:
        ``[word.regex="們$" pos="N.*"]``. ``pos`` uses regex search
        by default.

    Returns
    -------
    list
        A list of token objects (dictionaries), each representing one
        token in the query string (i.e. one bracketed expression).
        Each token has three key-value pairs:

        - ``tk``: ``str``. The pattern of the word to search for.
        - ``tk.regex``: ``bool``. Whether to search ``tk`` as a regex.
        - ``pos``: ``str``. The pattern of the pos tag to search for.
    """
    # Deal with a single exact-match token (no brackets in the query)
    if string.find("[") == -1:
        return [{
            'tk': string,
            'pos': None,
            'tk.regex': False,
        }]
    # Scan through the string to find matching bracket pairs
    tokens = []
    openPos = []
    depth = 0
    for i, char in enumerate(string):
        if char == '[':
            openPos.append(i)
            depth += 1
        if char == ']':
            start = openPos.pop()
            depth -= 1
            tokens.append({
                'start': start,
                'end': i,
                'inside': string[start + 1:i],
                'depth': depth
            })
    # Extract word/pos patterns from brackets at the top depth level
    tk_pat = re.compile('''word=['"]([^'"]+)['"]''')
    pos_pat = re.compile('''pos=['"]([^'" ]+)['"]''')
    tkRegEx_pat = re.compile(r'''word\.regex=['"]([^'"]+)['"]''')
    output = []
    for tk in tokens:
        if tk['depth'] == 0:
            token = tk_pat.findall(tk['inside'])
            tkRegEx = tkRegEx_pat.findall(tk['inside'])
            token = tkRegEx if tkRegEx else token
            pos = pos_pat.findall(tk['inside'])
            output.append({
                'tk': token[0] if len(token) > 0 else None,
                'pos': pos[0] if len(pos) > 0 else None,
                'tk.regex': True if tkRegEx else False,
            })
    return output
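
# --- Illustrative usage (not part of the original KWIC.queryParser module) ---
# A minimal sketch of what tokenize() returns for a two-token query, assuming
# the parsing behaviour of the code above; run this file directly to see it.
if __name__ == '__main__':
    demo_query = '[word="他們" pos="N.*"][word.regex="們$"]'
    for tok in tokenize(demo_query):
        print(tok)
    # Expected output, given the regexes above:
    #   {'tk': '他們', 'pos': 'N.*', 'tk.regex': False}
    #   {'tk': '們$', 'pos': None, 'tk.regex': True}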
def querySpecificity(queryObj={'tk': '^我們$', 'pos': 'N%', 'tk.regex': True}):
    """Score a token object for specificity.

    Parameters
    ----------
    queryObj : dict
        A token object from the list returned by :py:func:`.tokenize`.

    Returns
    -------
    float
        A score indicating the specificity of the token. A higher
        score means the token is more specific and is likely to match
        fewer results in the corpus. The score is used to pick the
        seed token of an ngram to search for in the corpus (to boost
        performance).
    """
    status = {
        'token': {
            'has_regEx': False,
            'zh_len': 0
        },
        'pos': {
            'has_wildcard': False,
            'tag_len': 0,
        }
    }
    #-------- Check token pattern --------#
    # RegEx metacharacters that indicate a more specific pattern
    regEx_meta = ['^', '$', '[', ']', '?', '{', '}', '(', ')', '|']
    if queryObj['tk.regex'] and \
       set(queryObj['tk']).intersection(regEx_meta):
        status['token']['has_regEx'] = True
    # Count Chinese characters (CJK Unified Ideographs) in the word pattern
    if queryObj['tk'] is not None:
        for char in queryObj['tk']:
            if '\u4e00' <= char <= '\u9fff':
                status['token']['zh_len'] += 1
    #------ Check pos tag pattern --------#
    if queryObj['pos'] is not None:
        if queryObj['pos'].find('%') != -1:
            status['pos']['has_wildcard'] = True
        for char in queryObj['pos']:
            if re.match('[A-Za-z]', char):
                status['pos']['tag_len'] += 1
    return 1.2 * status['token']['zh_len'] + status['token']['has_regEx'] + \
        0.5 * status['pos']['tag_len'] - 0.2 * status['pos']['has_wildcard']
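
# --- Illustrative usage (not part of the original KWIC.queryParser module) ---
# A minimal sketch of how the specificity scores might be used to pick the
# seed token of an ngram query; the scores.index(max(scores)) selection is an
# assumption for illustration, not something defined in this module.
if __name__ == '__main__':
    demo_query = '[word="他們" pos="N.*"][pos="V.*"]'
    demo_tokens = tokenize(demo_query)
    scores = [querySpecificity(tok) for tok in demo_tokens]
    seed_idx = scores.index(max(scores))
    print(scores)    # the token with a Chinese word and a pos tag scores highest
    print(seed_idx)  # 0 -> the most specific token is used as the query seed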