Fuzzy match book titles to eliminate duplicates

Titles with a Levenshtein ratio of 70% or greater are considered duplicates.
This commit is contained in:
Jonathan Rehm 2017-08-24 08:53:53 -07:00
parent 53c687251e
commit 58abc1d024
2 changed files with 29 additions and 7 deletions

View File

@ -13,6 +13,12 @@ try:
except ImportError:
goodreads_support = False
try:
import Levenshtein
levenshtein_support = True
except ImportError:
levenshtein_support = False
try:
from functools import reduce
except ImportError:
@ -1138,17 +1144,32 @@ def author(book_id, page):
if goodreads_support and config.config_use_goodreads:
gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
author_info = gc.find_author(author_name=name)
# Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
# Note: Not all images will be shown, even though they're available on Goodreads.com.
# See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
identifiers = reduce(lambda acc, book: acc + map(lambda identifier: identifier.val, book.identifiers), entries.all(), [])
other_books = filter(lambda book: book.isbn not in identifiers and book.gid["#text"] not in identifiers, author_info.books)
other_books = get_unique_other_books(entries.all(), author_info.books)
return render_title_template('author.html', entries=entries, pagination=pagination,
title=name, author=author_info, other_books=other_books)
def get_unique_other_books(library_books, author_books):
    """Return the author's books that do not appear to be in the library already.

    A candidate from ``author_books`` is dropped when either:

    * its ``isbn`` or its ``gid["#text"]`` equals the ``val`` of any identifier
      attached to a library book, or
    * (only when the optional ``Levenshtein`` module was imported) its title,
      with any parenthesised part removed, has a Levenshtein ratio of 70% or
      greater against the title of any library book.

    Note: Not all images will be shown, even though they're available on
    Goodreads.com.
    See https://www.goodreads.com/topic/show/18213769-goodreads-book-images

    :param library_books: iterable of books already in the library; each needs
        ``identifiers`` (items exposing ``val``) and ``title``.
    :param author_books: iterable of the author's books from Goodreads; each
        needs ``isbn``, ``gid`` (mapping with a ``"#text"`` key) and ``title``.
    :returns: a list of the remaining author books, in their original order.
    """
    # The library is walked twice (identifiers, then titles), so materialise it
    # once in case the caller handed over a one-shot iterable.
    library_books = list(library_books)

    # Deliberately a list, not a set: membership is then decided by equality
    # only, so candidate ids that happen to be unhashable cannot raise.
    identifiers = [identifier.val
                   for book in library_books
                   for identifier in book.identifiers]

    # Build real lists rather than relying on map()/filter(): on Python 3 those
    # return lazy iterators, which cannot be concatenated to a list and are
    # always truthy.
    other_books = [book for book in author_books
                   if book.isbn not in identifiers
                   and book.gid["#text"] not in identifiers]

    # Fuzzy match book titles
    if levenshtein_support:
        library_titles = [book.title for book in library_books]

        def is_duplicate(author_book):
            # Remove items in parentheses before comparing; done once per
            # candidate instead of once per (candidate, library title) pair.
            bare_title = re.sub(r"\(.*\)", "", author_book.title)
            return any(Levenshtein.ratio(bare_title, library_title) >= 0.7
                       for library_title in library_titles)

        other_books = [book for book in other_books if not is_duplicate(book)]

    return other_books
@app.route("/series")
@login_required_if_no_ano
def series_list():

View File

@ -11,4 +11,5 @@ PyYAML==3.12
rsa==3.4.2
six==1.10.0
uritemplate==3.0.0
goodreads==0.3.2
goodreads>=0.3.2
python-Levenshtein>=0.12.0