# Source code for pulsar.utils.slugify

'''A slugify function which handles unicode.

.. autofunction:: slugify

'''
import re
from unicodedata import normalize
from html.entities import name2codepoint

try:
    from unidecode import unidecode
except ImportError:
    unidecode = None

from .pep import to_string


# Named character entity reference such as ``&amp;``. The alternation is
# built from every entity name in ``html.entities.name2codepoint``.
CHAR_ENTITY_REXP = re.compile('&(%s);' % '|'.join(name2codepoint))

# Decimal character reference such as ``&#38;``.
# Raw string: ``\d`` is an invalid escape sequence in a plain literal.
DECIMAL_REXP = re.compile(r'&#(\d+);')

# Hexadecimal character reference such as ``&#x26;``.
HEX_REXP = re.compile(r'&#x([\da-fA-F]+);')

# Runs of apostrophes (removed outright, so "it's" becomes "its").
REPLACE1_REXP = re.compile(r'[\']+')
# Runs of anything that is not a dash, a lower-case ASCII letter or a digit.
REPLACE2_REXP = re.compile(r'[^-a-z0-9]+')
# Two or more consecutive dashes.
REMOVE_REXP = re.compile('-{2,}')


def slugify(value, separator='-', max_length=0, word_boundary=False,
            entities=True, decimal=True, hexadecimal=True):
    '''Build a lower-case slug from ``value``.

    The text is NFKD-normalised (and passed through ``unidecode`` when that
    optional package could be imported), HTML character references are
    optionally decoded, the result is lower-cased, apostrophes are removed
    and every remaining run of characters outside ``[-a-z0-9]`` is collapsed
    into a single ``-``. Leading and trailing dashes are stripped.

    :param value: the text to slugify; it is first converted with
        ``to_string(value, 'utf-8', 'ignore')``.
    :param separator: string placed between words in the returned slug
        (default ``'-'``).
    :param max_length: when greater than ``0`` the slug is shortened with
        :func:`smart_truncate`. Truncation happens *before* ``separator``
        replaces ``-``, so a multi-character separator can make the result
        longer than ``max_length``.
    :param word_boundary: forwarded to :func:`smart_truncate` as its
        ``word_boundaries`` argument.
    :param entities: decode named references such as ``&amp;``.
    :param decimal: decode decimal references such as ``&#38;``.
    :param hexadecimal: decode hexadecimal references such as ``&#x26;``.
    :return: the slug.
    '''
    value = normalize('NFKD', to_string(value, 'utf-8', 'ignore'))
    if unidecode:
        value = unidecode(value)

    # NOTE: references are decoded *after* the normalisation above, so a
    # reference that decodes to a character outside ``[-a-z0-9']`` (once
    # lower-cased) is turned into a separator further down.

    # character entity reference
    if entities:
        value = CHAR_ENTITY_REXP.sub(
            lambda m: chr(name2codepoint[m.group(1)]), value)

    # decimal character reference
    if decimal:
        try:
            value = DECIMAL_REXP.sub(lambda m: chr(int(m.group(1))), value)
        except (ValueError, OverflowError):
            # A reference names a code point ``chr``/``int`` cannot handle
            # (e.g. ``&#9999999999;``). Best effort: leave ``value`` as it
            # was -- none of the decimal references are decoded then.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            value = HEX_REXP.sub(lambda m: chr(int(m.group(1), 16)), value)
        except (ValueError, OverflowError):
            # Same best-effort policy as for decimal references.
            pass

    value = value.lower()

    value = REPLACE1_REXP.sub('', value)
    value = REPLACE2_REXP.sub('-', value)

    # remove redundant -
    value = REMOVE_REXP.sub('-', value).strip('-')

    # smart truncate if requested
    if max_length > 0:
        value = smart_truncate(value, max_length, word_boundary, '-')

    if separator != '-':
        value = value.replace('-', separator)

    return value


def smart_truncate(value, max_length=0, word_boundaries=False, separator=' '):
    """Truncate ``value`` to at most ``max_length`` characters.

    :param value: the string to shorten. Leading and trailing ``separator``
        characters are always stripped, even when nothing is truncated.
    :param max_length: maximum length of the result; a falsy value (the
        default ``0``) disables truncation.
    :param word_boundaries: when true and ``separator`` occurs in ``value``,
        the result is assembled from whole words only. Words are taken in
        order; a word that does not fit in the remaining room is skipped,
        and a later, shorter word may still be appended. If no word fits at
        all the string is cut at ``max_length`` characters instead.
    :param separator: the string that delimits words.
    :return: the (possibly shortened) string.
    """
    value = value.strip(separator)

    # Nothing to cut: truncation is disabled or the text already fits.
    # ``<=`` matters: a string of exactly ``max_length`` characters must be
    # returned whole instead of losing its last word further down.
    if not max_length or len(value) <= max_length:
        return value

    if not word_boundaries:
        return value[:max_length].strip(separator)

    if separator not in value:
        return value[:max_length]

    kept = []
    used = 0
    for word in value.split(separator):
        if not word:
            continue
        # Only charge for a separator *between* words; charging every word
        # for a trailing one rejects a word that would fit exactly.
        cost = len(word) + (len(separator) if kept else 0)
        if used + cost <= max_length:
            kept.append(word)
            used += cost

    if not kept:
        # Not even one whole word fits: fall back to a plain cut.
        return value[:max_length].strip(separator)
    return separator.join(kept).strip(separator)