Source code for KWIC.queryParser
import re
def tokenize(string):
    """Parse a query string for an ngram into token objects.

    Parameters
    ----------
    string : str
        Query string with each token enclosed in a pair of square
        brackets. In each token, the attributes ``word`` and ``pos``
        can be given, e.g. ``[word="他們" pos="N.*"]``. To search
        ``word`` with a regex, append ``.regex`` to ``word``:
        ``[word.regex="們$" pos="N.*"]``. ``pos`` uses regex search
        by default.

    Returns
    -------
    list
        A list of token objects (dictionaries), each representing one
        token in the query string (i.e. one bracketed expression).
        Each token has three key-value pairs:

        - ``tk``: ``str``. The pattern of the word to search for.
        - ``tk.regex``: ``bool``. Whether to search ``tk`` as a regex.
        - ``pos``: ``str``. The pattern of the pos tag to search for.
    """
    # Deal with a single exact-match token (no brackets in the query)
    if string.find("[") == -1:
        return [{
            'tk': string,
            'pos': None,
            'tk.regex': False,
        }]
    # Scan through the string to find matching bracket pairs
    tokens = []
    openPos = []
    depth = 0
    for i, char in enumerate(string):
        if char == '[':
            openPos.append(i)
            depth += 1
        if char == ']':
            start = openPos.pop()
            depth -= 1
            tokens.append({
                'start': start,
                'end': i,
                'inside': string[start + 1:i],
                'depth': depth
            })
    # Extract word/pos patterns from brackets at the top depth level
    tk_pat = re.compile('''word=['"]([^'"]+)['"]''')
    pos_pat = re.compile('''pos=['"]([^'" ]+)['"]''')
    tkRegEx_pat = re.compile(r'''word\.regex=['"]([^'"]+)['"]''')
    output = []
    for tk in tokens:
        if tk['depth'] == 0:
            token = tk_pat.findall(tk['inside'])
            tkRegEx = tkRegEx_pat.findall(tk['inside'])
            token = tkRegEx if tkRegEx else token
            pos = pos_pat.findall(tk['inside'])
            output.append({
                'tk': token[0] if len(token) > 0 else None,
                'pos': pos[0] if len(pos) > 0 else None,
                'tk.regex': True if tkRegEx else False,
            })
    return output
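
# --- Illustrative usage (not part of the original KWIC.queryParser module) ---
# A minimal sketch of what tokenize() returns for a two-token query, assuming
# the parsing behaviour of the code above; run this file directly to see it.
if __name__ == '__main__':
    demo_query = '[word="他們" pos="N.*"][word.regex="們$"]'
    for tok in tokenize(demo_query):
        print(tok)
    # Expected output, given the regexes above:
    #   {'tk': '他們', 'pos': 'N.*', 'tk.regex': False}
    #   {'tk': '們$', 'pos': None, 'tk.regex': True}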
def querySpecificity(queryObj={'tk': '^我們$', 'pos': 'N%', 'tk.regex': True}):
    """Score a token object for specificity.

    Parameters
    ----------
    queryObj : dict
        A token object from the list returned by :py:func:`.tokenize`.

    Returns
    -------
    float
        A score indicating the specificity of the token. A higher
        score means the token is more specific and is likely to match
        fewer results in the corpus. The score is used to pick the
        seed token of an ngram to search for in the corpus (to boost
        performance).
    """
    status = {
        'token': {
            'has_regEx': False,
            'zh_len': 0
        },
        'pos': {
            'has_wildcard': False,
            'tag_len': 0,
        }
    }
    #-------- Check token pattern --------#
    # RegEx metacharacters that indicate a more specific pattern
    regEx_meta = ['^', '$', '[', ']', '?', '{', '}', '(', ')', '|']
    if queryObj['tk.regex'] and \
       set(queryObj['tk']).intersection(regEx_meta):
        status['token']['has_regEx'] = True
    # Count Chinese characters (CJK Unified Ideographs) in the word pattern
    if queryObj['tk'] is not None:
        for char in queryObj['tk']:
            if '\u4e00' <= char <= '\u9fff':
                status['token']['zh_len'] += 1
    #------ Check pos tag pattern --------#
    if queryObj['pos'] is not None:
        if queryObj['pos'].find('%') != -1:
            status['pos']['has_wildcard'] = True
        for char in queryObj['pos']:
            if re.match('[A-Za-z]', char):
                status['pos']['tag_len'] += 1
    return 1.2 * status['token']['zh_len'] + status['token']['has_regEx'] + \
        0.5 * status['pos']['tag_len'] - 0.2 * status['pos']['has_wildcard']
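
# --- Illustrative usage (not part of the original KWIC.queryParser module) ---
# A minimal sketch of how the specificity scores might be used to pick the
# seed token of an ngram query; the scores.index(max(scores)) selection is an
# assumption for illustration, not something defined in this module.
if __name__ == '__main__':
    demo_query = '[word="他們" pos="N.*"][pos="V.*"]'
    demo_tokens = tokenize(demo_query)
    scores = [querySpecificity(tok) for tok in demo_tokens]
    seed_idx = scores.index(max(scores))
    print(scores)    # the token with a Chinese word and a pos tag scores highest
    print(seed_idx)  # 0 -> the most specific token is used as the query seed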