From c753d97d3f89bef71b5d33a87384cb642b71228d Mon Sep 17 00:00:00 2001 From: croneter Date: Fri, 18 Dec 2020 15:35:20 +0100 Subject: [PATCH] Greatly improve matching logic for tmdb if Plex does not provide id --- resources/lib/plex_api/artwork.py | 155 +------------------- resources/lib/plex_api/fanart_lookup.py | 182 ++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 149 deletions(-) create mode 100644 resources/lib/plex_api/fanart_lookup.py diff --git a/resources/lib/plex_api/artwork.py b/resources/lib/plex_api/artwork.py index a44be6f3..632d4355 100644 --- a/resources/lib/plex_api/artwork.py +++ b/resources/lib/plex_api/artwork.py @@ -2,12 +2,13 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, division, unicode_literals from logging import getLogger -from re import sub from ..kodi_db import KodiVideoDB, KodiMusicDB from ..downloadutils import DownloadUtils as DU from .. import utils, variables as v, app +from . import fanart_lookup + LOG = getLogger('PLEX.api') @@ -196,154 +197,10 @@ class Artwork(object): else: LOG.debug('Start movie set/collection lookup on themoviedb with %s', item.get('title', '')) - - api_key = utils.settings('themoviedbAPIKey') - if media_type == v.PLEX_TYPE_SHOW: - media_type = 'tv' - title = self.title() - # if the title has the year in remove it as tmdb cannot deal with it... - # replace e.g. 'The Americans (2015)' with 'The Americans' - title = sub(r'\s*\(\d{4}\)$', '', title, count=1) - url = 'https://api.themoviedb.org/3/search/%s' % media_type - parameters = { - 'api_key': api_key, - 'language': v.KODILANGUAGE, - 'query': title.encode('utf-8') - } - data = DU().downloadUrl(url, - authenticate=False, - parameters=parameters, - timeout=7) - try: - data.get('test') - except AttributeError: - LOG.warning('Could not download data from FanartTV') - return - if not data.get('results'): - LOG.info('No match found on themoviedb for type: %s, title: %s', - media_type, title) - return - - year = item.get('year') - match_found = None - # find year match - if year: - for entry in data['results']: - if year in entry.get('first_air_date', ''): - match_found = entry - break - elif year in entry.get('release_date', ''): - match_found = entry - break - # find exact match based on title, if we haven't found a year match - if match_found is None: - LOG.info('No themoviedb match found using year %s', year) - replacements = ( - ' ', - '-', - '&', - ',', - ':', - ';' - ) - for entry in data['results']: - name = entry.get('name', entry.get('title', '')) - original_name = entry.get('original_name', '') - title_alt = title.lower() - name_alt = name.lower() - org_name_alt = original_name.lower() - for replace_string in replacements: - title_alt = title_alt.replace(replace_string, '') - name_alt = name_alt.replace(replace_string, '') - org_name_alt = org_name_alt.replace(replace_string, '') - if name == title or original_name == title: - # match found for exact title name - match_found = entry - break - elif (name.split(' (')[0] == title or title_alt == name_alt or - title_alt == org_name_alt): - # match found with substituting some stuff - match_found = entry - break - - # if a match was not found, we accept the closest match from TMDB - if match_found is None and data.get('results'): - LOG.info('Using very first match from themoviedb') - match_found = entry = data.get('results')[0] - - if match_found is None: - LOG.info('Still no themoviedb match for type: %s, title: %s, ' - 'year: %s', media_type, title, year) - LOG.debug('themoviedb answer was %s', data['results']) - return - - LOG.info('Found themoviedb match for %s: %s', - item.get('title'), match_found) - - tmdb_id = str(entry.get('id', '')) - if tmdb_id == '': - LOG.error('No themoviedb ID found, aborting') - return - - if media_type == 'multi' and entry.get('media_type'): - media_type = entry.get('media_type') - name = entry.get('name', entry.get('title')) - # lookup external tmdb_id and perform artwork lookup on fanart.tv - parameters = {'api_key': api_key} - if media_type == 'movie': - url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id - parameters['append_to_response'] = 'videos' - elif media_type == 'tv': - url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id - parameters['append_to_response'] = 'external_ids,videos' - media_id, poster, background = None, None, None - for language in [v.KODILANGUAGE, 'en']: - parameters['language'] = language - data = DU().downloadUrl(url, - authenticate=False, - parameters=parameters, - timeout=7) - try: - data.get('test') - except AttributeError: - LOG.warning('Could not download %s with parameters %s', - url, parameters) - continue - if collection is False: - if data.get('imdb_id'): - media_id = str(data.get('imdb_id')) - break - if (data.get('external_ids') and - data['external_ids'].get('tvdb_id')): - media_id = str(data['external_ids']['tvdb_id']) - break - else: - if not data.get('belongs_to_collection'): - continue - media_id = data.get('belongs_to_collection').get('id') - if not media_id: - continue - media_id = str(media_id) - LOG.debug('Retrieved collections tmdb id %s for %s', - media_id, title) - url = 'https://api.themoviedb.org/3/collection/%s' % media_id - data = DU().downloadUrl(url, - authenticate=False, - parameters=parameters, - timeout=7) - try: - data.get('poster_path') - except AttributeError: - LOG.debug('Could not find TheMovieDB poster paths for %s' - ' in the language %s', title, language) - continue - if not poster and data.get('poster_path'): - poster = ('https://image.tmdb.org/t/p/original%s' % - data.get('poster_path')) - if not background and data.get('backdrop_path'): - background = ('https://image.tmdb.org/t/p/original%s' % - data.get('backdrop_path')) - return media_id, poster, background + return fanart_lookup.external_item_id(self.title(), + self.year(), + self.plex_type, + collection) def lookup_fanart_tv(self, media_id, artworks): """ diff --git a/resources/lib/plex_api/fanart_lookup.py b/resources/lib/plex_api/fanart_lookup.py new file mode 100644 index 00000000..863a7437 --- /dev/null +++ b/resources/lib/plex_api/fanart_lookup.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, unicode_literals +from logging import getLogger +from re import sub +from string import punctuation + +from ..downloadutils import DownloadUtils as DU +from .. import utils, variables as v + +LOG = getLogger('PLEX.api.fanartlookup') + +API_KEY = utils.settings('themoviedbAPIKey') + +# How far apart can a video's airing date be (in years) +YEARS_APART = 1 +# levenshtein_distance_ratio() returns a value between 0 (no match) and 1 (full +# match). What's the threshold? +LEVENSHTEIN_RATIO_THRESHOLD = 0.95 +# Which character should we ignore when matching video titles? +EXCLUDE_CHARS = set(punctuation) + + +def external_item_id(title, year, plex_type, collection): + LOG.debug('Start identifying %s (%s, %s)', title, year, plex_type) + year = int(year) if year else None + media_type = 'tv' if plex_type == v.PLEX_TYPE_SHOW else plex_type + # if the title has the year in remove it as tmdb cannot deal with it... + # replace e.g. 'The Americans (2015)' with 'The Americans' + title = sub(r'\s*\(\d{4}\)$', '', title, count=1) + url = 'https://api.themoviedb.org/3/search/%s' % media_type + parameters = { + 'api_key': API_KEY, + 'language': v.KODILANGUAGE, + 'query': title.encode('utf-8') + } + data = DU().downloadUrl(url, + authenticate=False, + parameters=parameters, + timeout=7) + try: + data = data['results'] + except (AttributeError, KeyError, TypeError): + LOG.debug('No match found on themoviedb for %s (%s, %s)', + title, year, media_type) + return + LOG.debug('themoviedb returned results: %s', data) + # Some entries don't contain a title or id - get rid of them + data = [x for x in data if 'title' in x and 'id' in x] + # Get rid of all results that do NOT have a matching release year + if year: + data = [x for x in data if __year_almost_matches(year, x)] + if not data: + LOG.debug('Empty results returned by themoviedb for %s (%s, %s)', + title, year, media_type) + return + # Calculate how similar the titles are + title = sanitize_string(title) + for entry in data: + entry['match_score'] = levenshtein_distance_ratio( + sanitize_string(entry['title']), title) + # (one of the possibly many) best match using levenshtein distance ratio + entry = max(data, key=lambda x: x['match_score']) + if entry['match_score'] < LEVENSHTEIN_RATIO_THRESHOLD: + LOG.debug('Best themoviedb match not good enough: %s', entry) + return + + # Check if we got several matches. If so, take the most popular one + best_matches = [x for x in data if + x['match_score'] == entry['match_score'] + and 'popularity' in x] + entry = max(best_matches, key=lambda x: x['popularity']) + LOG.debug('Found themoviedb match: %s', entry) + + # lookup external tmdb_id and perform artwork lookup on fanart.tv + tmdb_id = entry.get('id') + parameters = {'api_key': API_KEY} + if media_type == 'movie': + url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id + parameters['append_to_response'] = 'videos' + elif media_type == 'tv': + url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id + parameters['append_to_response'] = 'external_ids,videos' + media_id, poster, background = None, None, None + for language in (v.KODILANGUAGE, 'en'): + parameters['language'] = language + data = DU().downloadUrl(url, + authenticate=False, + parameters=parameters, + timeout=7) + try: + data.get('test') + except AttributeError: + LOG.warning('Could not download %s with parameters %s', + url, parameters) + continue + if collection is False: + if data.get('imdb_id'): + media_id = str(data.get('imdb_id')) + break + if (data.get('external_ids') and + data['external_ids'].get('tvdb_id')): + media_id = str(data['external_ids']['tvdb_id']) + break + else: + if not data.get('belongs_to_collection'): + continue + media_id = data.get('belongs_to_collection').get('id') + if not media_id: + continue + media_id = str(media_id) + LOG.debug('Retrieved collections tmdb id %s for %s', + media_id, title) + url = 'https://api.themoviedb.org/3/collection/%s' % media_id + data = DU().downloadUrl(url, + authenticate=False, + parameters=parameters, + timeout=7) + try: + data.get('poster_path') + except AttributeError: + LOG.debug('Could not find TheMovieDB poster paths for %s' + ' in the language %s', title, language) + continue + if not poster and data.get('poster_path'): + poster = ('https://image.tmdb.org/t/p/original%s' % + data.get('poster_path')) + if not background and data.get('backdrop_path'): + background = ('https://image.tmdb.org/t/p/original%s' % + data.get('backdrop_path')) + return media_id, poster, background + + +def __year_almost_matches(year, entry): + try: + entry_year = int(entry['release_date'][0:4]) + except (KeyError, ValueError): + return True + return abs(year - entry_year) <= YEARS_APART + + +def sanitize_string(s): + s = s.lower().strip() + # Get rid of chars in EXCLUDE_CHARS + s = ''.join(character for character in s if character not in EXCLUDE_CHARS) + # Get rid of multiple spaces + s = ' '.join(s.split()) + return s + + +def levenshtein_distance_ratio(s, t): + """ + Calculates levenshtein distance ratio between two strings. + The more similar the strings, the closer the result will be to 1. + The farther disjunct the string, the closer the result to 0 + + https://www.datacamp.com/community/tutorials/fuzzy-string-python + """ + # Initialize matrix of zeros + rows = len(s) + 1 + cols = len(t) + 1 + distance = [[0 for x in range(cols)] for y in range(rows)] + + # Populate matrix of zeros with the indeces of each character of both strings + for i in range(1, rows): + for k in range(1,cols): + distance[i][0] = i + distance[0][k] = k + + # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions + for col in range(1, cols): + for row in range(1, rows): + if s[row-1] == t[col-1]: + cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0 + else: + # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio + # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1. + cost = 2 + distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions + distance[row][col-1] + 1, # Cost of insertions + distance[row-1][col-1] + cost) # Cost of substitutions + return ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))