Source code for chatterbot.tagging

from typing import List, Union, Tuple
from chatterbot import languages
from chatterbot.utils import get_model_for_language
import spacy


class LowercaseTagger(object):
    """
    Returns the text in lowercase.
    """

    def __init__(self, language=None):
        from chatterbot.components import chatterbot_lowercase_indexer  # noqa

        self.language = language or languages.ENG

        # Create a new empty spacy nlp object
        self.nlp = spacy.blank(self.language.ISO_639_1)

        self.nlp.add_pipe(
            'chatterbot_lowercase_indexer',
            name='chatterbot_lowercase_indexer',
            last=True
        )
    def get_text_index_string(self, text: Union[str, List[str]]):
        if isinstance(text, list):
            documents = self.nlp.pipe(text, batch_size=1000, n_process=1)
            return [document._.search_index for document in documents]
        else:
            document = self.nlp(text)
            return document._.search_index
    def as_nlp_pipeline(
        self,
        texts: Union[List[str], List[Tuple[str, dict]]],
        batch_size: int = 1000,
        n_process: int = 1
    ):
        """
        Process texts through the spaCy NLP pipeline with optimized batching.

        :param texts: Text strings or tuples of (text, context_dict)
        :param batch_size: Number of texts per batch (default 1000)
        :param n_process: Number of worker processes for spaCy's pipe
            (set >1 to use multiprocessing)

        Usage:
            documents = tagger.as_nlp_pipeline(texts)
            documents = tagger.as_nlp_pipeline(texts, batch_size=2000, n_process=4)
        """
        process_as_tuples = texts and isinstance(texts[0], tuple)

        documents = self.nlp.pipe(
            texts,
            as_tuples=process_as_tuples,
            batch_size=batch_size,
            n_process=n_process
        )

        return documents
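
The `chatterbot_lowercase_indexer` component imported in `__init__` provides the custom `Doc._.search_index` attribute that `get_text_index_string` reads. As a point of reference, a minimal sketch of how such a component could be written with spaCy's component API follows; the name `demo_lowercase_indexer` and its body are illustrative assumptions, not chatterbot's actual implementation.

    import spacy
    from spacy.language import Language
    from spacy.tokens import Doc

    # Register the custom attribute that the tagger reads from each document
    if not Doc.has_extension('search_index'):
        Doc.set_extension('search_index', default='')

    @Language.component('demo_lowercase_indexer')
    def demo_lowercase_indexer(doc):
        # Store a lowercased copy of the text as the document's search index
        doc._.search_index = doc.text.lower()
        return doc

    nlp = spacy.blank('en')
    nlp.add_pipe('demo_lowercase_indexer', last=True)
    print(nlp('Hello World')._.search_index)  # hello world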
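
A brief usage sketch for `LowercaseTagger` itself, assuming chatterbot is installed: `get_text_index_string` returns one index string for a single input and a list of index strings for a list input, batching the latter through `nlp.pipe`.

    from chatterbot import languages
    from chatterbot.tagging import LowercaseTagger

    tagger = LowercaseTagger(language=languages.ENG)

    # A single string returns a single search-index string
    print(tagger.get_text_index_string('Hello World'))

    # A list of strings is batched through nlp.pipe and returns a list
    print(tagger.get_text_index_string(['Hello World', 'How are you?']))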
class PosLemmaTagger(object):

    def __init__(self, language=None):
        from chatterbot.components import chatterbot_bigram_indexer  # noqa

        self.language = language or languages.ENG

        model = get_model_for_language(self.language)

        # Disable the Named Entity Recognition (NER) component
        # because it is not necessary
        self.nlp = spacy.load(model, exclude=['ner'])

        self.nlp.add_pipe(
            'chatterbot_bigram_indexer',
            name='chatterbot_bigram_indexer',
            last=True
        )
    def get_text_index_string(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Return a string of text containing part-of-speech, lemma pairs.
        If a list of texts is given, return a list of such strings.
        """
        if isinstance(text, list):
            documents = self.nlp.pipe(text, batch_size=1000, n_process=1)
            return [document._.search_index for document in documents]
        else:
            document = self.nlp(text)
            return document._.search_index
    def as_nlp_pipeline(
        self,
        texts: Union[List[str], List[Tuple[str, dict]]],
        batch_size: int = 1000,
        n_process: int = 1
    ):
        """
        Accepts a list of strings, or a list of tuples where the first
        element is the text and the second element is a dictionary of
        context to return alongside the generated document.

        :param texts: Text strings or tuples of (text, context_dict)
        :param batch_size: Number of texts per batch (default 1000)
        :param n_process: Number of worker processes for spaCy's pipe
            (set >1 to use multiprocessing)

        Usage:
            documents = tagger.as_nlp_pipeline(texts)
            documents = tagger.as_nlp_pipeline(texts, batch_size=2000, n_process=4)
        """
        process_as_tuples = texts and isinstance(texts[0], tuple)

        documents = self.nlp.pipe(
            texts,
            as_tuples=process_as_tuples,
            batch_size=batch_size,
            n_process=n_process
        )

        return documents
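
A usage sketch for `PosLemmaTagger`, assuming the spaCy model that `get_model_for_language` resolves for the chosen language (for English, e.g. an `en_core_web_*` model) is installed:

    from chatterbot import languages
    from chatterbot.tagging import PosLemmaTagger

    tagger = PosLemmaTagger(language=languages.ENG)

    # Returns a single search-index string of part-of-speech / lemma pairs
    print(tagger.get_text_index_string('What time is it?'))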
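
When `as_nlp_pipeline` is given `(text, context)` tuples, spaCy's `pipe(as_tuples=True)` yields `(document, context)` pairs back in input order, which makes it easy to carry record identifiers through the pipeline. Continuing with the tagger above; the `'id'` keys are illustrative:

    texts = [
        ('What time is it?', {'id': 1}),
        ('It is noon.', {'id': 2}),
    ]

    for document, context in tagger.as_nlp_pipeline(texts, batch_size=500):
        print(context['id'], document._.search_index)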