Source code for munin.provider.normalize

#!/usr/bin/env python
# encoding: utf-8

"""
Overview
--------

Providers that know how to normalize specific input patterns like Artist names,
Album strings or Songtitles.

Reference
---------
"""


# Stdlib:
import re
import unicodedata


# Internal:
from munin.provider import Provider
import munin.stopwords

# External
import guess_language
guess_language.use_enchant(True)


def normalize_unicode_glyphs(string):
    """Return *string* converted to the Unicode normalization form NFKC.

    :param string: The text to normalize.
    :returns: The result of ``unicodedata.normalize`` for that form.
    """
    normalization_form = 'NFKC'
    return unicodedata.normalize(normalization_form, string)


def strip_stopwords(words):
    """Remove the stopwords of the guessed language from *words*.

    The language is guessed from the words joined by single spaces. If the
    guess is ``'UNKNOWN'`` or ``munin.stopwords.load_stopwords`` yields
    nothing for the guessed language, the words are returned unfiltered.

    :param words: An iterable of word strings.
    :returns: Always a new list, never a lazy iterator and never the
              object that was passed in.
    """
    # Materialize once: *words* is needed both for guessing the language
    # and for filtering, which a one-shot iterator would not survive.
    words = list(words)
    if not words:
        # There is no text to guess a language from.
        return []

    language_code = guess_language.guess_language(' '.join(words))
    if language_code == 'UNKNOWN':
        return words

    stopwords = munin.stopwords.load_stopwords(language_code)
    if not stopwords:
        return words

    return [word for word in words if word not in stopwords]


class NormalizeProvider(Provider):
    """Very simple provider that normalizes input strings.

    The input is normalized with the NFKC method, lowercased and stripped
    of surrounding whitespace. The result is returned as a plain string
    (it is *not* wrapped in a tuple).

    **Usage example:**

        >>> provider = NormalizeProvider()
        >>> provider.do_process('    aBc  ')
        'abc'
    """
    def do_process(self, input_string):
        """Return the normalized form of *input_string*.

        :param input_string: The string to normalize.
        :returns: The lowercased, stripped, NFKC-normalized string.
        """
        # Normalize *before* lowercasing: NFKC may expand a glyph into
        # uppercase letters (the trade mark sign becomes 'TM'), which would
        # otherwise survive the lower() call.
        folded = normalize_unicode_glyphs(input_string).lower().strip()

        # Normalize once more so the result is still NFKC after the case
        # mapping.
        return normalize_unicode_glyphs(folded)


class ArtistNormalizeProvider(Provider):
    """Normalize an Artist Name by normalizing common patterns.

    Takes a single string, outputs a tuple of subartists.

    The name is NFKC-normalized and lowercased, punctuation is removed, and
    the name is split in two at the first of the words ``feat``,
    ``featuring`` or ``and`` (``&`` counts as ``and``). From each part a
    leading ``the`` or ``a`` and every standalone ``of`` is removed.

    **Usage example:**

    .. code-block:: python

        >>> provider = ArtistNormalizeProvider()
        >>> provider.do_process('BertaX & Gustl')
        ('bertax', 'gustl')
        >>> provider.do_process("A Diablo's Swing Orchästra")
        ('diablo s swing orchästra',)
        >>> provider.do_process('a feat. b')
        ('a', 'b')
        >>> provider.do_process('The Beatles')
        ('beatles',)
        >>> provider.do_process('ABBA')
        ('abba',)

    This provider loosely follows this convention:

        http://labrosa.ee.columbia.edu/projects/musicsim/normalization.html
    """
    def __init__(self, **kwargs):
        Provider.__init__(self, **kwargs)
        # Everything that is not a letter or digit separates words.
        self._punctuation = re.compile(r"\W|_")
        self._split_reasons = frozenset(['feat', 'featuring', 'and'])

        # The patterns only match whole words: an article must be followed
        # by whitespace and 'of' must stand alone, so names like 'abba',
        # 'theory' or 'soft cell' are left intact.
        self._strip_patterns = [re.compile(pattern) for pattern in [
            r'^the\s+', r'^a\s+', r'\bof\b'
        ]]

    def do_process(self, input_string):
        """Split *input_string* into normalized sub artist names.

        :param input_string: The artist name as found in the metadata.
        :returns: A tuple with one normalized name, or with two if the
                  input contained a split word.
        """
        # Normalize first, so composed/compatibility glyphs are seen in
        # their canonical form by all the patterns below.
        text = normalize_unicode_glyphs(input_string).lower()

        # '&' would vanish as punctuation; keep its meaning as a separator.
        text = text.replace('&', ' and ')
        words = [word for word in self._punctuation.split(text) if word]

        for idx, word in enumerate(words):
            if word in self._split_reasons:
                # Only handle one.
                sub_artists = [
                    ' '.join(words[:idx]),
                    ' '.join(words[idx + 1:])
                ]
                break
        else:
            sub_artists = [' '.join(words)]

        normalized = []
        for sub_artist in sub_artists:
            for pattern in self._strip_patterns:
                sub_artist = pattern.sub('', sub_artist)

            # Removing a word from the middle leaves two spaces behind;
            # split/join collapses them and trims both ends.
            sub_artist = ' '.join(sub_artist.split())
            normalized.append(normalize_unicode_glyphs(sub_artist))

        return tuple(normalized)


class AlbumNormalizeProvider(Provider):
    """Normalize an Album name by normalizing common patterns.

    Takes a single string, outputs a tuple with one normalized name in it.

    The name is NFKC-normalized and lowercased, bracketed remarks and
    ``CD <n>`` / ``Disc <n>`` markers are removed, punctuation is dropped
    and the remaining words are passed through :func:`strip_stopwords`.

    **Usage example:**

    .. code-block:: python

        >>> provider = AlbumNormalizeProvider()
        >>> provider.do_process('## The Art of getting bugs (live) CD 12')
        ('art bugs', )

    Which words are removed depends on the guessed language and its
    stopword list; the output above assumes 'the', 'of' and 'getting' are
    treated as stopwords.

    This provider loosely follows this convention:

        http://labrosa.ee.columbia.edu/projects/musicsim/normalization.html
    """
    def __init__(self, **kwargs):
        Provider.__init__(self, **kwargs)
        # Everything that is not a letter or digit separates words.
        self._punctuation = re.compile(r"\W|_")
        self._strip_patterns = [re.compile(pattern) for pattern in [
            r'\s*[\(\[{].*?[}\)\]]',   # Strip everything in brackets ([{
            r'\s*\b(cd|disc)\s*\d+'    # remove CD <X> stuff (whole word only)
        ]]

    def do_process(self, input_string):
        """Return a one-element tuple with the normalized album name.

        :param input_string: The album name as found in the metadata.
        :returns: A tuple holding the single normalized name.
        """
        # Normalize first, so e.g. fullwidth brackets are recognized by the
        # bracket pattern instead of being treated as plain punctuation.
        step = normalize_unicode_glyphs(input_string).lower()
        for pattern in self._strip_patterns:
            step = pattern.sub('', step)

        words = [word for word in self._punctuation.split(step) if word]

        # A name made of stopwords only would be reduced to an empty string;
        # keep the unfiltered words in that case.
        remaining = list(strip_stopwords(words)) or words
        return (normalize_unicode_glyphs(' '.join(remaining)), )


# For now song titles are normalized exactly like album names, so the
# title provider is simply another name for the album provider class:
TitleNormalizeProvider = AlbumNormalizeProvider


if __name__ == '__main__':
    import unittest
    import sys

    # ``--cli <string>`` prints the normalized form of <string>;
    # without the flag the unit tests below are run.
    if '--cli' in sys.argv:
        # Take the argument following the flag, wherever the flag appears,
        # and fail with a usage message instead of an IndexError.
        cli_index = sys.argv.index('--cli')
        if cli_index + 1 >= len(sys.argv):
            sys.exit('usage: {0} --cli <string>'.format(sys.argv[0]))

        prov = TitleNormalizeProvider()
        print(prov.do_process(sys.argv[cli_index + 1]))
    else:
        class TestArtistNormalizeProvider(unittest.TestCase):
            def test_splitting(self):
                prov = ArtistNormalizeProvider()
                self.assertEqual(
                    prov.do_process('The *** Hello and Berta ###'),
                    ('hello', 'berta')
                )
                self.assertEqual(
                    prov.do_process('The *** Hello Berta ### featuring Gustl'),
                    ('hello berta', 'gustl')
                )

        class TestAlbumNormalizeProvider(unittest.TestCase):
            def test_splitting(self):
                prov = AlbumNormalizeProvider()
                self.assertEqual(
                    prov.do_process('### The art of getting &bugs (live!) [liver!!] {livest!!!} CD1'),
                    ('art bugs', )
                )

        unittest.main()
Source code for munin.provider.normalize

Related Topics

Useful links:

Navigation

Source code for munin.provider.normalize

Related Topics

Useful links:

Quick search