Source code for chatterbot.preprocessors
"""
Statement pre-processors.
"""
from unicodedata import normalize
from re import sub as re_sub
from html import unescape
[docs]
def clean_whitespace(statement):
"""
Remove any consecutive whitespace characters from the statement text.
"""
# Replace linebreaks and tabs with spaces
# Uses splitlines() which includes a superset of universal newlines:
# https://docs.python.org/3/library/stdtypes.html#str.splitlines
statement.text = ' '.join(statement.text.splitlines()).replace('\t', ' ')
# Remove any leading or trailing whitespace
statement.text = statement.text.strip()
# Remove consecutive spaces
statement.text = re_sub(' +', ' ', statement.text)
return statement
[docs]
def unescape_html(statement):
"""
Convert escaped html characters into unescaped html characters.
For example: "<b>" becomes "<b>".
"""
statement.text = unescape(statement.text)
return statement
[docs]
def convert_to_ascii(statement):
"""
Converts unicode characters to ASCII character equivalents.
For example: "på fédéral" becomes "pa federal".
"""
text = normalize('NFKD', statement.text)
text = text.encode('ascii', 'ignore').decode('utf-8')
statement.text = str(text)
return statement