Source code for munin.provider.normalize

#!/usr/bin/env python
# encoding: utf-8

"""
Overview
--------

Providers that know how to normalize specific kinds of input, such as artist
names, album names or song titles.

Reference
---------
"""


# Stdlib:
import re
import unicodedata


# Internal:
from munin.provider import Provider
import munin.stopwords

# External
import guess_language
guess_language.use_enchant(True)


def normalize_unicode_glyphs(string):
    """Return *string* converted to the Unicode normalization form NFKC.

    :param string: The text to normalize.
    :returns: The NFKC-normalized text.
    """
    normalization_form = 'NFKC'
    normalized = unicodedata.normalize(normalization_form, string)
    return normalized


def strip_stopwords(words):
    """Remove the stop words of the guessed language from *words*.

    The words are joined into a single text whose language is guessed with
    ``guess_language.guess_language``. The stop words for that language are
    then loaded through ``munin.stopwords.load_stopwords``.

    :param words: An iterable of word strings.
    :returns: A list of words. The words come back unfiltered when the
              language could not be guessed or when no stop words are
              available for it.
    """
    # Materialize once: a one-shot iterator would otherwise be exhausted by
    # the join below and come back empty on the early-return paths.
    words = list(words)
    if not words:
        return words

    language_code = guess_language.guess_language(' '.join(words))
    if language_code == 'UNKNOWN':
        return words

    stopwords = munin.stopwords.load_stopwords(language_code)
    if not stopwords:
        return words

    # Build a list (not a lazy ``filter`` object) so every path returns the
    # same kind of value and callers may call len() or iterate repeatedly.
    return [word for word in words if word not in stopwords]


class NormalizeProvider(Provider):
    """Very simple provider that normalizes input strings.

    The input is lowercased, stripped of surrounding whitespace and its
    unicode glyphs are normalized with the NFKC method.

    Returns a tuple with a single normalized string in it.

    **Usage example:**

    .. code-block:: python

        >>> provider = NormalizeProvider()
        >>> provider.do_process(' aBc ')
        ('abc', )
    """
    def do_process(self, input_string):
        """Normalize *input_string*.

        :param input_string: The string to normalize.
        :returns: A 1-tuple holding the normalized string.
        """
        normalized = normalize_unicode_glyphs(input_string.lower().strip())

        # Wrap the result in a tuple as documented; a bare string would be
        # iterated character by character by anything expecting a tuple.
        return (normalized, )
class ArtistNormalizeProvider(Provider):
    """Normalize an artist name by normalizing common patterns.

    Takes a single string, outputs a tuple of sub-artists.

    The name is lowercased and split at punctuation. It is then split once
    at the first ``feat``, ``featuring``, ``and`` or ``&``. A leading
    ``the``/``a`` and the word ``of`` are dropped from every sub-artist.

    **Usage example:**

    .. code-block:: python

        >>> provider = ArtistNormalizeProvider()
        >>> provider.do_process('BertaX & Gustl')
        ('bertax', 'gustl')
        >>> # Punctuation (including apostrophes) separates words:
        >>> provider.do_process("A Diablo's Swing Orchästra")
        ('diablo s swing orchästra', )
        >>> provider.do_process('a feat. b')
        ('a', 'b')
        >>> provider.do_process('The Beatles')
        ('beatles', )

    This provider loosely follows this convention:

        http://labrosa.ee.columbia.edu/projects/musicsim/normalization.html
    """
    def __init__(self, **kwargs):
        Provider.__init__(self, **kwargs)

        # Raw strings keep the regex escapes away from the string parser.
        self._punctuation = re.compile(r"\W|_")
        self._split_reasons = frozenset(['feat', 'featuring', 'and'])

        # Only whole words are stripped: "abba" must not become "bba",
        # "theory" must not become "ory" and "soft" must not become "st".
        self._strip_patterns = [re.compile(pattern) for pattern in [
            r'^the\s+',
            r'^a\s+',
            r'\s*\bof\b\s*'
        ]]

    def do_process(self, input_string):
        """Split *input_string* into normalized sub-artist names.

        :param input_string: The raw artist name.
        :returns: A tuple of normalized sub-artist names: two entries if a
                  split word was found, one entry otherwise.
        """
        # '&' would vanish together with the other punctuation, so spell it
        # out first to let it split the name exactly like 'and'.
        lowered = input_string.lower().replace('&', ' and ')
        tokens = [tok for tok in self._punctuation.split(lowered) if tok]

        sub_artists = [' '.join(tokens)]
        for idx, token in enumerate(tokens):
            if token in self._split_reasons:
                # Only handle one.
                sub_artists = [
                    ' '.join(tokens[:idx]),
                    ' '.join(tokens[idx + 1:])
                ]
                break

        return tuple(self._normalize_sub_artist(name) for name in sub_artists)

    def _normalize_sub_artist(self, name):
        """Drop filler words from one sub-artist and normalize its glyphs."""
        for pattern in self._strip_patterns:
            # Substitute a space (not nothing) so that removing "of" does
            # not glue its neighbours together ("art of noise").
            name = pattern.sub(' ', name)

        # Collapse the whitespace left behind by the substitutions.
        return normalize_unicode_glyphs(' '.join(name.split()))
class AlbumNormalizeProvider(Provider):
    """Normalize an album name by normalizing common patterns.

    Takes a single string, outputs a tuple with one normalized name in it.

    The name is lowercased, bracketed parts and ``CD <n>``/``Disc <n>``
    markers are removed and the rest is split at punctuation. Stop words are
    then stripped, so the exact result depends on the guessed language and
    on the stop words available for it.

    **Usage example** (assuming English stop words are found):

    .. code-block:: python

        >>> provider = AlbumNormalizeProvider()
        >>> provider.do_process(
        ...     '### The art of getting &bugs (live!) [liver!!] {livest!!!} CD1'
        ... )
        ('art bugs', )

    This provider loosely follows this convention:

        http://labrosa.ee.columbia.edu/projects/musicsim/normalization.html
    """
    def __init__(self, **kwargs):
        Provider.__init__(self, **kwargs)

        # Raw string keeps the regex escape away from the string parser.
        self._punctuation = re.compile(r"\W|_")
        self._strip_patterns = [re.compile(pattern) for pattern in [
            r'\s*[\(\[{].*?[}\)\]]',   # Strip everything in brackets ([{
            # Remove CD <X> stuff; \b stops it matching inside words
            # (e.g. "abcd 12" keeps its "cd").
            r'\s*\b(cd|disc)\s*\d+'
        ]]

    def do_process(self, input_string):
        """Normalize the album name *input_string*.

        :param input_string: The raw album name.
        :returns: A 1-tuple holding the normalized album name.
        """
        step = input_string.lower()
        for pattern in self._strip_patterns:
            step = pattern.sub('', step)

        # Splitting at punctuation leaves empty strings behind; drop them.
        words = [word for word in self._punctuation.split(step) if word]
        words = list(strip_stopwords(words))
        return (normalize_unicode_glyphs(' '.join(words).strip()), )


# For now they do the same:
TitleNormalizeProvider = AlbumNormalizeProvider


if __name__ == '__main__':
    import unittest
    import sys

    if '--cli' in sys.argv:
        # Normalize the argument that follows ``--cli``, wherever the flag
        # appears, instead of blindly reading sys.argv[2].
        value_index = sys.argv.index('--cli') + 1
        if value_index >= len(sys.argv):
            sys.exit('usage: {} --cli <title>'.format(sys.argv[0]))

        prov = TitleNormalizeProvider()
        print(prov.do_process(sys.argv[value_index]))
    else:
        class TestArtistNormalizeProvider(unittest.TestCase):
            def test_splitting(self):
                prov = ArtistNormalizeProvider()
                self.assertEqual(
                    prov.do_process('The *** Hello and Berta ###'),
                    ('hello', 'berta')
                )
                self.assertEqual(
                    prov.do_process('The *** Hello Berta ### featuring Gustl'),
                    ('hello berta', 'gustl')
                )

        class TestAlbumNormalizeProvider(unittest.TestCase):
            def test_splitting(self):
                prov = AlbumNormalizeProvider()
                self.assertEqual(
                    prov.do_process('### The art of getting &bugs (live!) [liver!!] {livest!!!} CD1'),
                    ('art bugs', )
                )

        unittest.main()

Related Topics

Useful links:

Package:

Github: