PlexKodiConnect/resources/lib/plex_api/fanart_lookup.py

182 lines
7.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from logging import getLogger
from re import sub
from string import punctuation
from ..downloadutils import DownloadUtils as DU
from .. import utils, variables as v
LOG = getLogger('PLEX.api.fanartlookup')
# Key sent as 'api_key' with every themoviedb.org request made by this module
API_KEY = utils.settings('themoviedbAPIKey')
# Maximum difference (in years) tolerated between an item's year and the
# release year reported by themoviedb for a result to still count as a match
YEARS_APART = 1
# levenshtein_distance_ratio() returns a value between 0 (no match) and 1 (full
# match). The best result must reach at least this ratio to be accepted
LEVENSHTEIN_RATIO_THRESHOLD = 0.95
# Characters that are ignored when comparing video titles
EXCLUDE_CHARS = set(punctuation)
def external_item_id(title, year, plex_type, collection):
    """
    Identify a Plex item on themoviedb.org and return its external ids.

    title:      the item's title. A trailing ' (YYYY)' is stripped before
                searching
    year:       the item's year (anything int() accepts) or a falsy value to
                skip filtering the search results by year
    plex_type:  the item's Plex type. v.PLEX_TYPE_SHOW is searched as 'tv',
                any other value is used verbatim as themoviedb media type
    collection: False to look up the item's own id (imdb id for movies, tvdb
                id for shows). Any other value looks up the themoviedb id and
                the artwork of the collection the item belongs to

    Returns None if the search failed or no result was similar enough.
    Otherwise returns the tuple (media_id, poster, background):
        media_id:   str id as described for `collection`, or None
        poster:     url of the collection's poster, or None
        background: url of the collection's backdrop, or None
    poster and background are only ever set for collection lookups.
    """
    LOG.debug('Start identifying %s (%s, %s)', title, year, plex_type)
    year = int(year) if year else None
    media_type = 'tv' if plex_type == v.PLEX_TYPE_SHOW else plex_type
    # if the title has the year in remove it as tmdb cannot deal with it...
    # replace e.g. 'The Americans (2015)' with 'The Americans'
    title = sub(r'\s*\(\d{4}\)$', '', title, count=1)
    url = 'https://api.themoviedb.org/3/search/%s' % media_type
    parameters = {
        'api_key': API_KEY,
        'language': v.KODILANGUAGE,
        'query': title.encode('utf-8')
    }
    data = DU().downloadUrl(url,
                            authenticate=False,
                            parameters=parameters,
                            timeout=7)
    try:
        data = data['results']
    except (AttributeError, KeyError, TypeError):
        LOG.debug('No match found on themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    LOG.debug('themoviedb returned results: %s', data)
    # themoviedb stores a movie's name under 'title' but a TV show's name
    # under 'name'. Only looking for 'title' would discard every TV result
    title_key = 'name' if media_type == 'tv' else 'title'
    # Some entries don't contain a title or id - get rid of them
    data = [x for x in data if x.get(title_key) and 'id' in x]
    # Get rid of all results that do NOT have a matching release year
    if year:
        data = [x for x in data if __year_almost_matches(year, x)]
    if not data:
        LOG.debug('Empty results returned by themoviedb for %s (%s, %s)',
                  title, year, media_type)
        return
    # Calculate how similar the titles are
    title = sanitize_string(title)
    for entry in data:
        entry['match_score'] = levenshtein_distance_ratio(
            sanitize_string(entry[title_key]), title)
    # (one of the possibly many) best match using levenshtein distance ratio
    entry = max(data, key=lambda x: x['match_score'])
    if entry['match_score'] < LEVENSHTEIN_RATIO_THRESHOLD:
        LOG.debug('Best themoviedb match not good enough: %s', entry)
        return
    # Check if we got several matches. If so, take the most popular one
    best_matches = [x for x in data
                    if x['match_score'] == entry['match_score']
                    and 'popularity' in x]
    if best_matches:
        # Only switch if popularity is known for at least one candidate;
        # max() of an empty list would raise ValueError
        entry = max(best_matches, key=lambda x: x['popularity'])
    LOG.debug('Found themoviedb match: %s', entry)
    # lookup external tmdb_id and perform artwork lookup on fanart.tv
    tmdb_id = entry['id']
    parameters = {'api_key': API_KEY}
    if media_type == 'movie':
        url = 'https://api.themoviedb.org/3/movie/%s' % tmdb_id
        parameters['append_to_response'] = 'videos'
    elif media_type == 'tv':
        url = 'https://api.themoviedb.org/3/tv/%s' % tmdb_id
        parameters['append_to_response'] = 'external_ids,videos'
    media_id, poster, background = None, None, None
    # Try the Kodi language first, then fall back to English
    for language in (v.KODILANGUAGE, 'en'):
        parameters['language'] = language
        data = DU().downloadUrl(url,
                                authenticate=False,
                                parameters=parameters,
                                timeout=7)
        if not hasattr(data, 'get'):
            # Anything that is not dict-like means the download failed
            LOG.warning('Could not download %s with parameters %s',
                        url, parameters)
            continue
        if collection is False:
            if data.get('imdb_id'):
                media_id = str(data.get('imdb_id'))
                break
            if (data.get('external_ids') and
                    data['external_ids'].get('tvdb_id')):
                media_id = str(data['external_ids']['tvdb_id'])
                break
        else:
            if not data.get('belongs_to_collection'):
                continue
            media_id = data.get('belongs_to_collection').get('id')
            if not media_id:
                continue
            media_id = str(media_id)
            LOG.debug('Retrieved collections tmdb id %s for %s',
                      media_id, title)
            collection_url = ('https://api.themoviedb.org/3/collection/%s'
                              % media_id)
            data = DU().downloadUrl(collection_url,
                                    authenticate=False,
                                    parameters=parameters,
                                    timeout=7)
            if not hasattr(data, 'get'):
                LOG.debug('Could not find TheMovieDB poster paths for %s'
                          ' in the language %s', title, language)
                continue
            # Keep artwork found in an earlier (preferred) language
            if not poster and data.get('poster_path'):
                poster = ('https://image.tmdb.org/t/p/original%s' %
                          data.get('poster_path'))
            if not background and data.get('backdrop_path'):
                background = ('https://image.tmdb.org/t/p/original%s' %
                              data.get('backdrop_path'))
            if poster and background:
                # Nothing left to fill in - skip the fallback language
                break
    return media_id, poster, background
def __year_almost_matches(year, entry):
    """
    Return True if the themoviedb search result `entry` (a dict) was released
    at most YEARS_APART years before or after `year` (an int).

    The year is read from the first 4 characters of 'release_date' (movies)
    or, if that is missing or empty, 'first_air_date' (TV shows). Entries
    without a usable date are not rejected: True is returned for them.
    """
    date = entry.get('release_date') or entry.get('first_air_date')
    try:
        entry_year = int(date[0:4])
    except (TypeError, ValueError):
        # Date missing, null or malformed - we cannot rule the entry out
        return True
    return abs(year - entry_year) <= YEARS_APART
def sanitize_string(s):
    """
    Normalize a title for comparison: lower-case it, drop every character
    contained in EXCLUDE_CHARS and collapse runs of whitespace into a single
    space. Returns the normalized string.
    """
    lowered = s.lower().strip()
    kept = [char for char in lowered if char not in EXCLUDE_CHARS]
    # split() without arguments swallows consecutive whitespace
    words = ''.join(kept).split()
    return ' '.join(words)
def levenshtein_distance_ratio(s, t):
    """
    Calculates levenshtein distance ratio between two strings.
    The more similar the strings, the closer the result will be to 1.
    The farther disjunct the string, the closer the result to 0
    https://www.datacamp.com/community/tutorials/fuzzy-string-python

    Insertions and deletions cost 1, substitutions cost 2. The returned float
    is (len(s) + len(t) - distance) / (len(s) + len(t)). Two empty strings
    are considered identical and yield 1.0.
    """
    total_length = len(s) + len(t)
    if not total_length:
        # Both strings are empty: identical, and avoids a division by zero
        return 1.0
    rows = len(s) + 1
    cols = len(t) + 1
    # distance[i][k] holds the edit distance between s[:i] and t[:k]
    distance = [[0] * cols for _ in range(rows)]
    # Turning a prefix into the empty string (or vice versa) costs one
    # deletion/insertion per character
    for i in range(1, rows):
        distance[i][0] = i
    for k in range(1, cols):
        distance[0][k] = k
    for col in range(1, cols):
        for row in range(1, rows):
            # In order to align the results with those of the Python
            # Levenshtein package, a substitution costs 2 instead of 1
            cost = 0 if s[row - 1] == t[col - 1] else 2
            distance[row][col] = min(
                distance[row - 1][col] + 1,          # Cost of deletions
                distance[row][col - 1] + 1,          # Cost of insertions
                distance[row - 1][col - 1] + cost)   # Cost of substitutions
    # Read the final cell by index instead of relying on leaked loop
    # variables, which are undefined if one of the strings is empty
    return (total_length - distance[rows - 1][cols - 1]) / float(total_length)