PlexKodiConnect/resources/lib/plex_api/fanart_lookup.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from logging import getLogger
from re import sub
from string import punctuation

from ..downloadutils import DownloadUtils as DU
from .. import utils, variables as v

# Module-level logger used by all functions in this file
LOG = getLogger('PLEX.api.fanartlookup')

# themoviedb API key, read once at import time via utils.settings() and sent
# as the 'api_key' parameter with every themoviedb request below
API_KEY = utils.settings('themoviedbAPIKey')

# Maximum difference (in years) between the year we are looking for and the
# year of a themoviedb search result for that result to still be considered
YEARS_APART = 1
# levenshtein_distance_ratio() returns a value between 0 (no match) and 1 (full
# match). The best search result is discarded if its ratio is below this value
LEVENSHTEIN_RATIO_THRESHOLD = 0.95
# Characters that sanitize_string() strips before video titles are compared
# (string.punctuation, i.e. the ASCII punctuation characters)
EXCLUDE_CHARS = set(punctuation)


def external_item_id(title, year, plex_type, collection):
    """
    Identifies a video on themoviedb by its title (and year, if given) and
    looks up an external id for it.

    title:      video title; a trailing ' (YYYY)' is stripped before searching
    year:       release year (anything int() accepts) or a falsy value if the
                search results should not be filtered by year
    plex_type:  v.PLEX_TYPE_SHOW is mapped to themoviedb's 'tv' endpoints; any
                other value is used as the themoviedb media type as-is
    collection: if False, the imdb id (or, failing that, the tvdb id found in
                'external_ids') of the matched item is looked up. Otherwise
                the id, poster and backdrop of the themoviedb collection the
                matched item belongs to are looked up

    Returns None if the video could not be identified on themoviedb. Otherwise
    returns the tuple (media_id, poster, background); each element is either a
    string or None if it could not be retrieved.
    """
    LOG.debug('Start identifying %s (%s, %s)', title, year, plex_type)
    year = int(year) if year else None
    media_type = 'tv' if plex_type == v.PLEX_TYPE_SHOW else plex_type
    # if the title has the year in remove it as tmdb cannot deal with it...
    # replace e.g. 'The Americans (2015)' with 'The Americans'
    title = sub(r'\s*\(\d{4}\)$', '', title, count=1)
    url = 'https://api.themoviedb.org/3/search/%s' % media_type
    parameters = {
        'api_key': API_KEY,
        'language': v.KODILANGUAGE,
        'query': title.encode('utf-8')
    }
    data = DU().downloadUrl(url,
                            authenticate=False,
                            parameters=parameters,
                            timeout=7)
    try:
        data = data['results']
    except (AttributeError, KeyError, TypeError):
        LOG.debug('No match found on themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    LOG.debug('themoviedb returned results: %s', data)
    # themoviedb stores the title of a movie as 'title' but the title of a tv
    # show as 'name'. Only looking at 'title' would drop every tv show result
    title_key = 'name' if media_type == 'tv' else 'title'
    # Some entries don't contain a (non-empty) title or id - get rid of them
    data = [x for x in data if x.get(title_key) and 'id' in x]
    # Get rid of all results that do NOT have a matching release year
    if year:
        data = [x for x in data if __year_almost_matches(year, x)]
    if not data:
        LOG.debug('Empty results returned by themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    # Calculate how similar the titles are
    title = sanitize_string(title)
    for entry in data:
        entry['match_score'] = levenshtein_distance_ratio(
            sanitize_string(entry[title_key]), title)
    # (one of the possibly many) best match using levenshtein distance ratio
    entry = max(data, key=lambda x: x['match_score'])
    if entry['match_score'] < LEVENSHTEIN_RATIO_THRESHOLD:
        LOG.debug('Best themoviedb match not good enough: %s', entry)
        return

    # Check if we got several matches. If so, take the most popular one. If
    # none of them comes with a popularity, stick to the match found above
    # instead of calling max() on an empty list
    best_matches = [x for x in data if
                    x['match_score'] == entry['match_score']
                    and x.get('popularity') is not None]
    if best_matches:
        entry = max(best_matches, key=lambda x: x['popularity'])
    LOG.debug('Found themoviedb match: %s', entry)

    # Use the tmdb id of the match to look up the item's details
    tmdb_id = entry.get('id')
    parameters = {'api_key': API_KEY}
    media_id, poster, background = None, None, None
    if media_type == 'movie':
        url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id
        parameters['append_to_response'] = 'videos'
    elif media_type == 'tv':
        url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id
        parameters['append_to_response'] = 'external_ids,videos'
    else:
        # No details endpoint known for this type; do not re-use the search
        # url from above
        LOG.warning('Unsupported themoviedb media type %s', media_type)
        return media_id, poster, background
    for language in (v.KODILANGUAGE, 'en'):
        parameters['language'] = language
        data = DU().downloadUrl(url,
                                authenticate=False,
                                parameters=parameters,
                                timeout=7)
        if not hasattr(data, 'get'):
            # Anything without .get() is not a usable answer
            LOG.warning('Could not download %s with parameters %s',
                        url, parameters)
            continue
        if collection is False:
            if data.get('imdb_id'):
                media_id = str(data.get('imdb_id'))
                break
            if (data.get('external_ids') and
                    data['external_ids'].get('tvdb_id')):
                media_id = str(data['external_ids']['tvdb_id'])
                break
        else:
            belongs_to = data.get('belongs_to_collection')
            if not belongs_to:
                continue
            collection_id = belongs_to.get('id')
            if not collection_id:
                continue
            media_id = str(collection_id)
            LOG.debug('Retrieved collections tmdb id %s for %s',
                      media_id, title)
            # Keep `url` pointing to the item itself so that the next
            # language still asks for the item, not for the collection
            collection_url = ('https://api.themoviedb.org/3/collection/%s'
                              % media_id)
            data = DU().downloadUrl(collection_url,
                                    authenticate=False,
                                    parameters=parameters,
                                    timeout=7)
            if not hasattr(data, 'get'):
                LOG.debug('Could not find TheMovieDB poster paths for %s'
                          ' in the language %s', title, language)
                continue
            if not poster and data.get('poster_path'):
                poster = ('https://image.tmdb.org/t/p/original%s' %
                          data.get('poster_path'))
            if not background and data.get('backdrop_path'):
                background = ('https://image.tmdb.org/t/p/original%s' %
                              data.get('backdrop_path'))
            if poster and background:
                # Nothing left that the fallback language could add
                break
    return media_id, poster, background


def __year_almost_matches(year, entry):
    """
    Returns True if the themoviedb search result `entry` (a dict) dates from
    at most YEARS_APART years before or after `year` (an int).

    The year is taken from the first 4 characters of 'release_date' (movies)
    or, if that is missing or empty, 'first_air_date' (tv shows). Entries
    without a usable date cannot be ruled out and thus yield True as well.
    """
    date = entry.get('release_date') or entry.get('first_air_date')
    try:
        entry_year = int(date[0:4])
    except (TypeError, ValueError):
        # No date at all (None) or something that does not start with a year
        return True
    return abs(year - entry_year) <= YEARS_APART


def sanitize_string(s):
    """
    Normalizes a title for comparison: the string is lowercased, surrounding
    whitespace is removed, every character in EXCLUDE_CHARS is dropped and
    any run of whitespace is collapsed into a single space.
    """
    kept = [char for char in s.lower().strip() if char not in EXCLUDE_CHARS]
    # Dropping characters may leave several spaces in a row (or at the ends);
    # split() + join() reduces them to single spaces
    words = ''.join(kept).split()
    return ' '.join(words)


def levenshtein_distance_ratio(s, t):
    """
    Calculates levenshtein distance ratio between two strings.
    The more similar the strings, the closer the result will be to 1.
    The farther disjunct the string, the closer the result to 0

    Insertions and deletions cost 1, substitutions cost 2 in order to align
    the results with those of the Python Levenshtein package. The ratio is
    (len(s) + len(t) - distance) / (len(s) + len(t)) and always a float.
    Two empty strings are identical and yield 1.0; exactly one empty string
    yields 0.0.

    https://www.datacamp.com/community/tutorials/fuzzy-string-python
    """
    total_length = len(s) + len(t)
    if not total_length:
        # Both strings are empty: identical, and nothing to divide by
        return 1.0
    # Only the previous row of the distance matrix is needed at any time.
    # previous[col] is the distance between the part of s processed so far and
    # t[:col]. For the empty prefix of s that is col insertions
    previous = list(range(len(t) + 1))
    for row, char_s in enumerate(s, 1):
        # Turning s[:row] into the empty string takes row deletions
        current = [row]
        for col, char_t in enumerate(t, 1):
            cost = 0 if char_s == char_t else 2
            current.append(min(previous[col] + 1,          # Deletion
                               current[col - 1] + 1,       # Insertion
                               previous[col - 1] + cost))  # Substitution
        previous = current
    # float() so that Python 2 does not truncate the ratio to 0 or 1
    return float(total_length - previous[-1]) / total_length
Greatly improve matching logic for tmdb if Plex does not provide id 2020-12-19 01:35:20 +11:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
			`from logging import getLogger`
			`from re import sub`
			`from string import punctuation`

			`from ..downloadutils import DownloadUtils as DU`
			`from .. import utils, variables as v`

			`LOG = getLogger('PLEX.api.fanartlookup')`

			`API_KEY = utils.settings('themoviedbAPIKey')`

			`# How far apart can a video's airing date be (in years)`
			`YEARS_APART = 1`
			`# levenshtein_distance_ratio() returns a value between 0 (no match) and 1 (full`
			`# match). What's the threshold?`
			`LEVENSHTEIN_RATIO_THRESHOLD = 0.95`
			`# Which character should we ignore when matching video titles?`
			`EXCLUDE_CHARS = set(punctuation)`


			`def external_item_id(title, year, plex_type, collection):`
			`LOG.debug('Start identifying %s (%s, %s)', title, year, plex_type)`
			`year = int(year) if year else None`
			`media_type = 'tv' if plex_type == v.PLEX_TYPE_SHOW else plex_type`
			`# if the title has the year in remove it as tmdb cannot deal with it...`
			`# replace e.g. 'The Americans (2015)' with 'The Americans'`
			`title = sub(r'\s*\(\d{4}\)$', '', title, count=1)`
			`url = 'https://api.themoviedb.org/3/search/%s' % media_type`
			`parameters = {`
			`'api_key': API_KEY,`
			`'language': v.KODILANGUAGE,`
			`'query': title.encode('utf-8')`
			`}`
			`data = DU().downloadUrl(url,`
			`authenticate=False,`
			`parameters=parameters,`
			`timeout=7)`
			`try:`
			`data = data['results']`
			`except (AttributeError, KeyError, TypeError):`
			`LOG.debug('No match found on themoviedb for %s (%s, %s)',`
			`title, year, media_type)`
			`return`
			`LOG.debug('themoviedb returned results: %s', data)`
			`# Some entries don't contain a title or id - get rid of them`
			`data = [x for x in data if 'title' in x and 'id' in x]`
			`# Get rid of all results that do NOT have a matching release year`
			`if year:`
			`data = [x for x in data if __year_almost_matches(year, x)]`
			`if not data:`
			`LOG.debug('Empty results returned by themoviedb for %s (%s, %s)',`
			`title, year, media_type)`
			`return`
			`# Calculate how similar the titles are`
			`title = sanitize_string(title)`
			`for entry in data:`
			`entry['match_score'] = levenshtein_distance_ratio(`
			`sanitize_string(entry['title']), title)`
			`# (one of the possibly many) best match using levenshtein distance ratio`
			`entry = max(data, key=lambda x: x['match_score'])`
			`if entry['match_score'] < LEVENSHTEIN_RATIO_THRESHOLD:`
			`LOG.debug('Best themoviedb match not good enough: %s', entry)`
			`return`

			`# Check if we got several matches. If so, take the most popular one`
			`best_matches = [x for x in data if`
			`x['match_score'] == entry['match_score']`
			`and 'popularity' in x]`
			`entry = max(best_matches, key=lambda x: x['popularity'])`
			`LOG.debug('Found themoviedb match: %s', entry)`

			`# lookup external tmdb_id and perform artwork lookup on fanart.tv`
			`tmdb_id = entry.get('id')`
			`parameters = {'api_key': API_KEY}`
			`if media_type == 'movie':`
			`url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id`
			`parameters['append_to_response'] = 'videos'`
			`elif media_type == 'tv':`
			`url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id`
			`parameters['append_to_response'] = 'external_ids,videos'`
			`media_id, poster, background = None, None, None`
			`for language in (v.KODILANGUAGE, 'en'):`
			`parameters['language'] = language`
			`data = DU().downloadUrl(url,`
			`authenticate=False,`
			`parameters=parameters,`
			`timeout=7)`
			`try:`
			`data.get('test')`
			`except AttributeError:`
			`LOG.warning('Could not download %s with parameters %s',`
			`url, parameters)`
			`continue`
			`if collection is False:`
			`if data.get('imdb_id'):`
			`media_id = str(data.get('imdb_id'))`
			`break`
			`if (data.get('external_ids') and`
			`data['external_ids'].get('tvdb_id')):`
			`media_id = str(data['external_ids']['tvdb_id'])`
			`break`
			`else:`
			`if not data.get('belongs_to_collection'):`
			`continue`
			`media_id = data.get('belongs_to_collection').get('id')`
			`if not media_id:`
			`continue`
			`media_id = str(media_id)`
			`LOG.debug('Retrieved collections tmdb id %s for %s',`
			`media_id, title)`
			`url = 'https://api.themoviedb.org/3/collection/%s' % media_id`
			`data = DU().downloadUrl(url,`
			`authenticate=False,`
			`parameters=parameters,`
			`timeout=7)`
			`try:`
			`data.get('poster_path')`
			`except AttributeError:`
			`LOG.debug('Could not find TheMovieDB poster paths for %s'`
			`' in the language %s', title, language)`
			`continue`
			`if not poster and data.get('poster_path'):`
			`poster = ('https://image.tmdb.org/t/p/original%s' %`
			`data.get('poster_path'))`
			`if not background and data.get('backdrop_path'):`
			`background = ('https://image.tmdb.org/t/p/original%s' %`
			`data.get('backdrop_path'))`
			`return media_id, poster, background`


			`def __year_almost_matches(year, entry):`
			`try:`
			`entry_year = int(entry['release_date'][0:4])`
			`except (KeyError, ValueError):`
			`return True`
			`return abs(year - entry_year) <= YEARS_APART`


			`def sanitize_string(s):`
			`s = s.lower().strip()`
			`# Get rid of chars in EXCLUDE_CHARS`
			`s = ''.join(character for character in s if character not in EXCLUDE_CHARS)`
			`# Get rid of multiple spaces`
			`s = ' '.join(s.split())`
			`return s`


			`def levenshtein_distance_ratio(s, t):`
			`"""`
			`Calculates levenshtein distance ratio between two strings.`
			`The more similar the strings, the closer the result will be to 1.`
			`The farther disjunct the string, the closer the result to 0`

			`https://www.datacamp.com/community/tutorials/fuzzy-string-python`
			`"""`
			`# Initialize matrix of zeros`
			`rows = len(s) + 1`
			`cols = len(t) + 1`
			`distance = [[0 for x in range(cols)] for y in range(rows)]`

			`# Populate matrix of zeros with the indeces of each character of both strings`
			`for i in range(1, rows):`
			`for k in range(1,cols):`
			`distance[i][0] = i`
			`distance[0][k] = k`

			`# Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions`
			`for col in range(1, cols):`
			`for row in range(1, rows):`
			`if s[row-1] == t[col-1]:`
			`cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0`
			`else:`
			`# In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio`
			`# the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.`
			`cost = 2`
			`distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions`
			`distance[row][col-1] + 1, # Cost of insertions`
			`distance[row-1][col-1] + cost) # Cost of substitutions`
			`return ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))`