Source code for KWIC.queryDB

#%%
import sqlite3

#------- Helpers ----------#
def sentPos2textPos(sent_len_lst, sent_id, position):
    """Convert a within-sentence token index into a within-text token index.

    Parameters
    ----------
    sent_len_lst : list
        Lengths (in tokens) of the sentences of a text, in order.
    sent_id : int
        Index of the sentence the token belongs to.
    position : int
        Index of the token inside that sentence.

    Returns
    -------
    int
        ``position`` shifted by the total length of all sentences
        preceding ``sent_id``.
    """
    # Tokens in the first sentence need no shifting.
    if sent_id == 0:
        return position
    # Index explicitly (rather than slicing) so that an out-of-range
    # ``sent_id`` still raises IndexError.
    preceding_tokens = sum(sent_len_lst[idx] for idx in range(sent_id))
    return position + preceding_tokens


#%%
import json
import re
import pandas as pd

class Corpus():
    """Query a corpus stored in a sqlite database.

    The sqlite database is used to locate tokens (by form and/or PoS tag),
    while a copy of the corpus loaded from a jsonl file is kept in memory
    (``self.corp``) to retrieve the context of the located tokens.
    ``self.corp`` is indexed as ``corp[text_id][sent_id][position]`` and
    each token is a ``(word, tag)`` pair.
    """

    # Operators that may be interpolated into a ``WHERE x <opr> ?`` clause.
    # Operators cannot be bound as SQL parameters, so they are validated
    # against this whitelist instead of being trusted blindly.
    _MATCH_OPERATORS = frozenset({'=', 'REGEXP', 'LIKE'})

    # Columns returned by one-gram queries, in SELECT order.
    _ONEGRAM_COLUMNS = ['text_id', 'sent_id', 'position', 'token_id', 'pos_id']

    def __init__(self, db='data/asbc.sqlite', corp="data/asbc_lite.jsonl"):
        """Initialize a corpus for query

        Parameters
        ----------
        db : str, optional
            Relative path to the sqlite database of the corpus,
            by default 'data/asbc.sqlite'.
        corp : str, optional
            Relative path to the jsonl file of the corpus,
            by default "data/asbc_lite.jsonl". This file is
            read into memory to enable fast locating of kwic
            in the corpus. It is read as UTF-8; blank lines are skipped.
        """

        def functionRegex(pattern, value):
            # SQLite evaluates ``X REGEXP Y`` as ``regexp(Y, X)``, hence the
            # (pattern, value) argument order.
            if pattern is None or value is None:
                # NULL never matches (and must not raise inside SQLite).
                return False
            # The pattern is wrapped in a non-capturing group so that the
            # word boundaries apply to the whole pattern, including
            # alternations such as ``N.*|V.*``.
            return re.search(r"\b(?:" + pattern + r")\b", value) is not None

        # sqlite corpus
        conn = sqlite3.connect(db)
        conn.create_function("REGEXP", 2, functionRegex)
        # Connection object of sqlite3
        self.conn = conn
        self.cursor = conn.cursor()

        # in-memory corpus (one JSON document, i.e. one text, per line)
        try:
            with open(corp, encoding="utf-8") as f:
                self.corp = [json.loads(line) for line in f if line.strip()]
        except Exception:
            # Do not leak the database connection if the corpus can't load.
            conn.close()
            raise

    def close(self):
        """Close the underlying sqlite connection."""
        self.conn.close()

    @classmethod
    def _checkOperator(cls, opr):
        """Return the normalized SQL operator, or raise if unsupported.

        Raises
        ------
        ValueError
            If ``opr`` is not one of ``=``, ``REGEXP`` or ``LIKE``
            (case-insensitive).
        """
        normalized = str(opr).strip().upper()
        if normalized not in cls._MATCH_OPERATORS:
            raise ValueError(
                "Unsupported match operator {!r}; expected one of {}".format(
                    opr, sorted(cls._MATCH_OPERATORS)))
        return normalized

    @staticmethod
    def _normalizeGender(gender):
        """Map ``gender`` to ``0``/``1``, or ``None`` when no filter applies."""
        if gender is None:
            return None
        try:
            code = int(gender)
        except (TypeError, ValueError):
            return None
        return code if code in (0, 1) else None

    def queryOneGram(self, token, pos, matchOpr={'token': '=', 'pos': 'REGEXP'}, gender=None):
        """Query KWIC of one token

        Parameters
        ----------
        token : str
            Pattern of the keyword's form, or None to match any form.
        pos : str
            RegEx pattern of the keyword's PoS tag, or None to match any
            tag. E.g., to search for:

            - Nouns, use ``N.*``
            - Verbs, use ``V.*``

            See the tag set `here <https://github.com/ckiplab/ckiptagger/wiki/POS-Tags>`_.
        matchOpr: dict
            The operator ``<opr>`` given to the SQL command in
            ``WHERE x <opr> pattern``. Could be one of ``=`` (exact match),
            ``REGEXP`` (uses RegEx to match pattern), or
            ``LIKE`` (uses ``%`` to match pattern).
            Defaults to exact match for ``token`` and RegEx for ``pos``.
            The dictionary is only read, never modified.
        gender: int, optional
            Pre-filter SQL database based on the sex of the texts authors.

            - ``0``: female
            - ``1``: male
            - other values: all (no filter)

        Returns
        -------
        pandas.DataFrame
            A pandas dataframe for matching keywords and their
            positional information in the corpus.

        Raises
        ------
        ValueError
            If both ``token`` and ``pos`` are None, or if an operator in
            ``matchOpr`` is not supported.
        """

        if token is None and pos is None:
            raise ValueError(
                "queryOneGram requires at least one of `token` or `pos`")

        # Build the WHERE clause piece by piece; every user-supplied value
        # is passed as a bound parameter.
        conditions = []
        params = []

        gender_code = self._normalizeGender(gender)
        if gender_code is not None:
            conditions.append("gender = ?")
            params.append(gender_code)

        if token is not None:
            token_opr = self._checkOperator(matchOpr['token'])
            conditions.append(
                f"token_id IN (SELECT token_id FROM token WHERE token {token_opr} ?)")
            params.append(token)

        if pos is not None:
            pos_opr = self._checkOperator(matchOpr['pos'])
            conditions.append(
                f"pos_id IN (SELECT pos_id FROM pos WHERE pos {pos_opr} ?)")
            params.append(pos)

        sqlQuery = (
            "SELECT text_id, sent_id, position, token_id, pos_id FROM oneGram WHERE "
            + " AND ".join(conditions)
        )

        # Fetch eagerly on a fresh cursor so the result does not depend on
        # the state of a shared cursor.
        rows = self.conn.execute(sqlQuery, params).fetchall()

        return pd.DataFrame(data=rows, columns=self._ONEGRAM_COLUMNS)

    def getNgram(self, text_id, sent_id, position, anchor={'n': 4, 'seed': 1}):
        """Get the ngram of a seed token from the in-memory corpus

        The three parameters ``text_id``, ``sent_id``, and ``position`` together
        locates the position of a seed token in the corpus. The info about the ngram
        in which this seed token lies is saved in the parameter ``anchor``.

        Parameters
        ----------
        text_id : int
            The index of the text in the corpus.
        sent_id : int
            The index of the sentence in the text.
        position : int
            The index of the token in the sentence.
        anchor : dict, optional
            Information about the seed token's ngram, by default
            {'n': 4, 'seed': 1}. The dictionary is only read.

            - ``seed``: The token's position in the ngram
            - ``n``:  The ngram's length

        Returns
        -------
        list or None
            An ngram stored as (word, tag) pairs in a list, or None if
            the ngram does not fit entirely inside the sentence.
        """

        sent = self.corp[text_id][sent_id]
        ngram_idx_start = position - anchor['seed']
        # A negative start would make the slice wrap around to the end of
        # the sentence; the ngram simply doesn't fit in that case.
        if ngram_idx_start < 0:
            return None
        ngram = sent[ngram_idx_start:(ngram_idx_start + anchor['n'])]
        if len(ngram) != anchor['n']:
            return None
        return ngram

    def _getQueryMatchSet(self, query):
        """Collect, for each query token, the forms and tags it can match.

        Parameters
        ----------
        query : list
            List of token dictionaries with the keys ``tk``, ``pos`` and
            ``tk.regex``.

        Returns
        -------
        list
            One ``{'tk': set, 'pos': set}`` dictionary per query token.
            A set is empty when the corresponding constraint is None.
        """
        out = []
        for q in query:
            # Token forms are matched exactly unless flagged as RegEx;
            # PoS tags are always matched as RegEx.
            tk_opr = 'REGEXP' if q['tk.regex'] else '='
            matching_tk = set()
            matching_pos = set()
            if q['tk'] is not None:
                rows = self.conn.execute(
                    f"SELECT token FROM token WHERE token {tk_opr} ?",
                    (q['tk'],))
                matching_tk = {r[0] for r in rows}
            if q['pos'] is not None:
                rows = self.conn.execute(
                    "SELECT pos FROM pos WHERE pos REGEXP ?",
                    (q['pos'],))
                matching_pos = {r[0] for r in rows}
            out.append({'tk': matching_tk, 'pos': matching_pos})
        return out

    @staticmethod
    def _ngramMatches(ngram, query, queryMatchSet):
        """Tell whether every token of ``ngram`` satisfies its query token.

        A constraint that the user left unspecified (None) is treated as
        matching whatever token or tag is found in the corpus.
        """
        for i, (ngram_tk, ngram_pos) in enumerate(ngram):
            tk_equal = (query[i]['tk'] is None) or (ngram_tk in queryMatchSet[i]['tk'])
            pos_equal = (query[i]['pos'] is None) or (ngram_pos in queryMatchSet[i]['pos'])
            if not (tk_equal and pos_equal):
                return False
        return True

    def queryNgram(self, query, anchor={'n': 2, 'seed': 1}, gender=None):
        """Query KWIC of phrases

        Parameters
        ----------
        query : list
            A list of token objects (dictionaries), with each dictionary
            representing the token in the query string (i.e. token enclosed
            in the brackets). Returned by :py:func:`queryParser.tokenize`.
        anchor : dict, optional
            Passed to ``anchor`` in :py:meth:`.getNgram`,
            by default {'n': 2, 'seed': 1}.
        gender : int, optional
            Passed to ``gender`` in :py:meth:`.queryOneGram`, by default None.

        Returns
        -------
        pandas.DataFrame
            A pandas dataframe for matching keywords and their
            positional information in the corpus.
        """

        # Query Seed Token
        seed = query[anchor['seed']]
        matchOpr = {
            'token': 'REGEXP' if seed['tk.regex'] else '=',
            'pos': 'REGEXP',
        }
        oneGram = self.queryOneGram(token=seed['tk'], pos=seed['pos'],
                                    matchOpr=matchOpr, gender=gender)

        # Scan through ngrams of the seed token. Rows are tracked by their
        # integer position because they are selected with ``iloc`` below.
        queryMatchSet = self._getQueryMatchSet(query)
        valid_rows = []
        for row_idx, row in enumerate(oneGram.itertuples(index=False)):
            ngram = self.getNgram(row.text_id, row.sent_id, row.position, anchor)
            # A falsy ngram means it could not be extracted from the sentence
            if ngram and self._ngramMatches(ngram, query, queryMatchSet):
                valid_rows.append(row_idx)

        return oneGram.iloc[valid_rows]

    def concordance(self, text_id, sent_id, position, n=1, left=10, right=10):
        """Retrieve a KWIC instance from corpus based on positional information

        Parameters
        ----------
        text_id : int
            Index of a text, i.e. of an item in the first level of
            the in-memory corpus ``corp``.
        sent_id : int
            Index of a sentence in the text, i.e. of an item in the
            second level of the in-memory corpus ``corp``.
        position : int
            Index of a word in the sentence, i.e. of an item in the
            third level of the in-memory corpus ``corp``.
        n : int, optional
            Keyword length, by default 1
        left : int, optional
            Left context size, in number of tokens, by default 10
        right : int, optional
            Right context size, in number of tokens, by default 10

        Returns
        -------
        dict
            A dictionary with:

            - ``keyword``: the keyword and its PoS tag
            - ``left`` & ``right``: the left and right context,
                consisting of tokens and their PoS tags. The contexts
                are shorter than requested when the keyword is close
                to the beginning or the end of the text.
        """

        text = self.corp[text_id]
        # Flatten the sentences so that context can span sentence boundaries
        full_text = [tk for sent in text for tk in sent]

        # Position of the keyword in the flattened text: its position in
        # the sentence plus the length of all preceding sentences.
        keyword_idx = position + sum(len(text[i]) for i in range(sent_id))
        keyword_end = keyword_idx + n
        # Clamp at 0: a negative slice start would wrap around to the end
        # of the text instead of stopping at its beginning.
        left_start = max(0, keyword_idx - left)

        return {
            'keyword': full_text[keyword_idx:keyword_end],
            'left': full_text[left_start:keyword_idx],
            'right': full_text[keyword_end:(keyword_end + right)]
        }
Source code for KWIC.queryDB

KWIC backend

Navigation

Related Topics