# Source code for pulsar.utils.slugify

'''A slugify function which handles unicode

.. autofunction:: slugify

'''
import re
from unicodedata import normalize
from html.entities import name2codepoint

try:
    from unidecode import unidecode
except ImportError:
    unidecode = None

from .pep import to_string


# Named character entity reference such as ``&amp;``; the alternation is
# built from every name known to ``html.entities.name2codepoint`` and the
# captured group is the entity name.
CHAR_ENTITY_REXP = re.compile('&(%s);' % '|'.join(name2codepoint))

# Decimal character reference such as ``&#233;``; the captured group is the
# digit string. A raw string is used so ``\d`` is a regex escape rather than
# an (invalid) Python string escape.
DECIMAL_REXP = re.compile(r'&#(\d+);')

# Hexadecimal character reference such as ``&#xe9;`` or ``&#XE9;``; the
# captured group is the hexadecimal digit string.
HEX_REXP = re.compile(r'&#[xX]([\da-fA-F]+);')

# One or more apostrophes (``slugify`` drops these outright instead of
# turning them into a dash).
REPLACE1_REXP = re.compile(r'[\']+')
# Any run of characters other than ``-``, ``a-z`` and ``0-9``.
REPLACE2_REXP = re.compile(r'[^-a-z0-9]+')
# Two or more consecutive dashes.
REMOVE_REXP = re.compile('-{2,}')


def _decode_char_reference(match, base):
    '''Return the character for a numeric character reference.

    :param match: a match produced by ``DECIMAL_REXP`` or ``HEX_REXP``;
        group 1 holds the digits.
    :param base: numeric base of the digits (``10`` or ``16``).
    :return: the decoded character, or the matched text unchanged when the
        number is not a valid code point.
    '''
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # The number is outside the range accepted by ``chr`` (or too large
        # to convert): keep this reference as-is rather than failing, so
        # the other references in the string are still decoded.
        return match.group(0)


def slugify(value, separator='-', max_length=0, word_boundary=False,
            entities=True, decimal=True, hexadecimal=True):
    '''Normalizes string, removes non-alpha characters, and converts
    spaces to ``separator`` character

    :param value: the input to slugify. It is passed through
        ``to_string(value, 'utf-8', 'ignore')`` (project helper, presumably
        decoding bytes to text) and NFKD-normalized.
    :param separator: string used in place of ``-`` in the result.
    :param max_length: when greater than ``0`` the slug is shortened with
        :func:`smart_truncate` to at most this many characters.
    :param word_boundary: forwarded to :func:`smart_truncate` as its
        ``word_boundaries`` argument.
    :param entities: decode named character references (``&amp;``).
    :param decimal: decode decimal character references (``&#38;``).
    :param hexadecimal: decode hexadecimal character references
        (``&#x26;``).
    :return: a lower-case string made only of ``a-z``, ``0-9`` and
        ``separator``, without leading or trailing separators.

    Numeric references that do not denote a valid code point are left
    undecoded.
    '''
    value = normalize('NFKD', to_string(value, 'utf-8', 'ignore'))
    # ``unidecode`` is an optional dependency; it is ``None`` when missing.
    if unidecode:
        value = unidecode(value)

    # NOTE(review): references are decoded *after* normalization and
    # unidecode, so a decoded non-ASCII character (e.g. from ``&eacute;``)
    # is not transliterated and ends up replaced by a dash below. Confirm
    # whether that ordering is intended.

    # character entity reference
    if entities:
        # The pattern is built from ``name2codepoint`` keys, so the lookup
        # cannot miss.
        value = CHAR_ENTITY_REXP.sub(
            lambda m: chr(name2codepoint[m.group(1)]), value)

    # decimal character reference
    if decimal:
        value = DECIMAL_REXP.sub(
            lambda m: _decode_char_reference(m, 10), value)

    # hexadecimal character reference
    if hexadecimal:
        value = HEX_REXP.sub(
            lambda m: _decode_char_reference(m, 16), value)

    value = value.lower()
    # Apostrophes are dropped (``it's`` -> ``its``); every other run of
    # disallowed characters becomes a single dash.
    value = REPLACE1_REXP.sub('', value)
    value = REPLACE2_REXP.sub('-', value)

    # remove redundant -
    value = REMOVE_REXP.sub('-', value).strip('-')

    # smart truncate if requested
    if max_length > 0:
        value = smart_truncate(value, max_length, word_boundary, '-')

    # Truncation works on dashes; swap in the caller's separator last.
    if separator != '-':
        value = value.replace('-', separator)

    return value
def smart_truncate(value, max_length=0, word_boundaries=False, separator=' ',
                   save_order=False):
    """Truncate a string to at most ``max_length`` characters.

    :param value: the string to truncate; characters found in ``separator``
        are stripped from both ends first.
    :param max_length: maximum length of the result. A false or negative
        value disables truncation.
    :param word_boundaries: when true, only whole ``separator``-delimited
        words are kept instead of cutting in the middle of a word.
    :param separator: the string delimiting words.
    :param save_order: only used with ``word_boundaries``. When false (the
        default) a word that does not fit is skipped and later, shorter
        words may still be appended; when true truncation stops at the
        first word that does not fit.
    :return: the truncated string, stripped of ``separator`` characters at
        both ends.

    If ``word_boundaries`` is set but ``value`` contains no separator, or
    no word fits, the string is simply cut at ``max_length``.
    """
    value = value.strip(separator)
    # No limit requested (0/None) or a meaningless negative one.
    if not max_length or max_length < 0:
        return value
    # Already short enough, including an exact fit.
    if len(value) <= max_length:
        return value
    if not word_boundaries:
        return value[:max_length].strip(separator)
    # Without a usable separator there are no word boundaries to honour
    # (``str.split`` would also reject an empty separator).
    if not separator or separator not in value:
        return value[:max_length]

    kept = []
    length = 0
    for word in value.split(separator):
        if not word:
            continue
        # Length of the result if this word is appended: a separator is
        # only needed between words, never after the last one.
        next_len = length + len(word) + (len(separator) if kept else 0)
        if next_len <= max_length:
            kept.append(word)
            length = next_len
        elif save_order:
            break

    if not kept:
        # Not even one word fits: fall back to a hard cut.
        return value[:max_length].strip(separator)
    return separator.join(kept).strip(separator)