PlexKodiConnect/resources/lib/plex_api/fanart_lookup.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, unicode_literals
from builtins import str
from builtins import range
from logging import getLogger
from re import sub
from string import punctuation

from ..downloadutils import DownloadUtils as DU
from .. import utils, variables as v

# Logger shared by all functions in this module
LOG = getLogger('PLEX.api.fanartlookup')

# themoviedb API key, fetched once via utils.settings() when this module is
# imported
API_KEY = utils.settings('themoviedbAPIKey')

# Maximum allowed difference (in years) between the year we are looking for
# and the year of a themoviedb search result
YEARS_APART = 1
# levenshtein_distance_ratio() returns a value between 0 (no match) and 1 (full
# match). A best match scoring below this threshold is rejected.
LEVENSHTEIN_RATIO_THRESHOLD = 0.95
# Characters that are ignored when matching video titles (the ASCII
# punctuation characters from string.punctuation)
EXCLUDE_CHARS = set(punctuation)


def _result_title(entry):
    """
    Returns the title of a single themoviedb search result or None if there
    is none. Movie results store their title under 'title', TV show results
    under 'name' (see the themoviedb search API).
    """
    return entry.get('title') or entry.get('name')


def external_item_id(title, year, plex_type, collection):
    """
    Searches themoviedb for a video and returns an external id for it.

    title:      title of the video. A trailing ' (YYYY)' is stripped before
                searching
    year:       year of the video; pass a falsy value to skip the filtering
                of search results by year
    plex_type:  v.PLEX_TYPE_SHOW is searched as themoviedb media type 'tv',
                any other value is passed on to themoviedb unchanged
    collection: if False, the id of the video itself is looked up ('imdb_id'
                or, failing that, the 'tvdb_id' of its external ids). For
                any other value the collection the video belongs to is
                looked up instead

    Returns None if the search did not yield a sufficiently good match.
    Otherwise returns the tuple (media_id, poster, background). Any of the
    three may be None; poster and background are image urls that are only
    ever set for collection lookups.
    """
    LOG.debug('Start identifying %s (%s, %s)', title, year, plex_type)
    year = int(year) if year else None
    media_type = 'tv' if plex_type == v.PLEX_TYPE_SHOW else plex_type
    # if the title has the year in remove it as tmdb cannot deal with it...
    # replace e.g. 'The Americans (2015)' with 'The Americans'
    title = sub(r'\s*\(\d{4}\)$', '', title, count=1)
    url = 'https://api.themoviedb.org/3/search/%s' % media_type
    parameters = {
        'api_key': API_KEY,
        'language': v.KODILANGUAGE,
        'query': title.encode('utf-8')
    }
    data = DU().downloadUrl(url,
                            authenticate=False,
                            parameters=parameters,
                            timeout=7)
    try:
        data = data['results']
    except (AttributeError, KeyError, TypeError):
        LOG.debug('No match found on themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    LOG.debug('themoviedb returned results: %s', data)
    # Some entries don't contain a title or id - get rid of them. Use
    # _result_title() because TV results have a 'name' instead of a 'title'
    data = [x for x in data if _result_title(x) and 'id' in x]
    # Get rid of all results that do NOT have a matching release year
    if year:
        data = [x for x in data if __year_almost_matches(year, x)]
    if not data:
        LOG.debug('Empty results returned by themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    # Calculate how similar the titles are
    title = sanitize_string(title)
    for entry in data:
        entry['match_score'] = levenshtein_distance_ratio(
            sanitize_string(_result_title(entry)), title)
    # (one of the possibly many) best match using levenshtein distance ratio
    entry = max(data, key=lambda x: x['match_score'])
    if entry['match_score'] < LEVENSHTEIN_RATIO_THRESHOLD:
        LOG.debug('Best themoviedb match not good enough: %s', entry)
        return

    # Check if we got several matches. If so, take the most popular one.
    # Only do so if at least one of them has a usable popularity; otherwise
    # stick to the best match found above instead of calling max() on an
    # empty list
    best_matches = [x for x in data if
                    x['match_score'] == entry['match_score']
                    and x.get('popularity') is not None]
    if best_matches:
        entry = max(best_matches, key=lambda x: x['popularity'])
    LOG.debug('Found themoviedb match: %s', entry)

    # Download the details of the match from themoviedb in order to get the
    # external ids (or the collection the match belongs to)
    tmdb_id = entry.get('id')
    parameters = {'api_key': API_KEY}
    if media_type == 'movie':
        url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id
        parameters['append_to_response'] = 'videos'
    elif media_type == 'tv':
        url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id
        parameters['append_to_response'] = 'external_ids,videos'
    media_id, poster, background = None, None, None
    # Try the Kodi language first and fall back to English
    for language in (v.KODILANGUAGE, 'en'):
        parameters['language'] = language
        data = DU().downloadUrl(url,
                                authenticate=False,
                                parameters=parameters,
                                timeout=7)
        try:
            data.get('test')
        except AttributeError:
            # Anything without a get() method is not a usable answer
            LOG.warning('Could not download %s with parameters %s',
                        url, parameters)
            continue
        if collection is False:
            if data.get('imdb_id'):
                media_id = str(data.get('imdb_id'))
                break
            if (data.get('external_ids') and
                    data['external_ids'].get('tvdb_id')):
                media_id = str(data['external_ids']['tvdb_id'])
                break
        else:
            collection_info = data.get('belongs_to_collection')
            if not collection_info:
                continue
            collection_id = collection_info.get('id')
            if not collection_id:
                # Do not overwrite an id found in a previous iteration
                continue
            media_id = str(collection_id)
            LOG.debug('Retrieved collections tmdb id %s for %s',
                      media_id, title)
            # Use a separate variable: url must keep pointing to the video's
            # details for the next language of this loop
            collection_url = ('https://api.themoviedb.org/3/collection/%s'
                              % media_id)
            data = DU().downloadUrl(collection_url,
                                    authenticate=False,
                                    parameters=parameters,
                                    timeout=7)
            try:
                data.get('poster_path')
            except AttributeError:
                LOG.debug('Could not find TheMovieDB poster paths for %s'
                          ' in the language %s', title, language)
                continue
            if not poster and data.get('poster_path'):
                poster = ('https://image.tmdb.org/t/p/original%s' %
                          data.get('poster_path'))
            if not background and data.get('backdrop_path'):
                background = ('https://image.tmdb.org/t/p/original%s' %
                              data.get('backdrop_path'))
            if poster and background:
                # Nothing left to look for in the fallback language
                break
    return media_id, poster, background


def __year_almost_matches(year, entry):
    """
    Returns True if the themoviedb search result entry [dict] was released
    at most YEARS_APART years before or after year [int].

    Also returns True if the year of the entry cannot be determined (date
    missing, None or not starting with 4 digits) so that such entries are
    not discarded.
    """
    # Movie results carry a 'release_date', TV show results a
    # 'first_air_date' (both formatted like 'YYYY-MM-DD')
    date = entry.get('release_date') or entry.get('first_air_date')
    try:
        entry_year = int(date[0:4])
    except (TypeError, ValueError):
        # TypeError: no date at all (None); ValueError: e.g. empty string
        return True
    return abs(year - entry_year) <= YEARS_APART


def sanitize_string(s):
    """
    Normalizes a title for comparison: returns s in lower case, without any
    of the characters in EXCLUDE_CHARS, without leading or trailing
    whitespace and with every run of whitespace collapsed to a single space.
    """
    lowered = s.lower().strip()
    kept = [char for char in lowered if char not in EXCLUDE_CHARS]
    # split() without arguments splits on any run of whitespace
    words = ''.join(kept).split()
    return ' '.join(words)


def levenshtein_distance_ratio(s, t):
    """
    Calculates levenshtein distance ratio between two strings.
    The more similar the strings, the closer the result will be to 1.
    The farther disjunct the string, the closer the result to 0

    Insertions and deletions cost 1, substitutions cost 2. The ratio is
        (len(s) + len(t) - distance) / (len(s) + len(t))
    Two empty strings are considered identical and yield 1.0; exactly one
    empty string yields 0.0.

    https://www.datacamp.com/community/tutorials/fuzzy-string-python
    """
    total_length = len(s) + len(t)
    if total_length == 0:
        # Both strings are empty and thus identical. Returning early also
        # avoids dividing by zero below
        return 1.0
    rows = len(s) + 1
    cols = len(t) + 1
    # distance[row][col] will hold the distance between the first row
    # characters of s and the first col characters of t
    distance = [[0] * cols for _ in range(rows)]
    # Turning a prefix into the empty string (and vice versa) costs one
    # deletion (insertion) per character
    for row in range(1, rows):
        distance[row][0] = row
    for col in range(1, cols):
        distance[0][col] = col

    # Iterate over the matrix to compute the cost of deletions, insertions
    # and/or substitutions
    for col in range(1, cols):
        for row in range(1, rows):
            # In order to align the results with those of the Python
            # Levenshtein package, the cost of a substitution is 2 when
            # calculating the ratio (it would be 1 for the plain distance)
            cost = 0 if s[row - 1] == t[col - 1] else 2
            distance[row][col] = min(
                distance[row - 1][col] + 1,         # Cost of deletions
                distance[row][col - 1] + 1,         # Cost of insertions
                distance[row - 1][col - 1] + cost)  # Cost of substitutions
    # Read the bottom-right cell explicitly instead of relying on the loop
    # variables, which are never set if one of the strings is empty
    return float(total_length - distance[rows - 1][cols - 1]) / total_length