Fuzzy match book titles to eliminate duplicates
Titles with a Levenshtein ratio greater than 70% are considered duplicates.
This commit is contained in:
parent
53c687251e
commit
58abc1d024
33
cps/web.py
33
cps/web.py
@ -13,6 +13,12 @@ try:
|
||||
except ImportError:
|
||||
goodreads_support = False
|
||||
|
||||
try:
|
||||
import Levenshtein
|
||||
levenshtein_support = True
|
||||
except ImportError:
|
||||
levenshtein_support = False
|
||||
|
||||
try:
|
||||
from functools import reduce
|
||||
except ImportError:
|
||||
@ -1138,17 +1144,32 @@ def author(book_id, page):
|
||||
if goodreads_support and config.config_use_goodreads:
|
||||
gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
|
||||
author_info = gc.find_author(author_name=name)
|
||||
|
||||
# Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
|
||||
# Note: Not all images will be shown, even though they're available on Goodreads.com.
|
||||
# See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
|
||||
identifiers = reduce(lambda acc, book: acc + map(lambda identifier: identifier.val, book.identifiers), entries.all(), [])
|
||||
other_books = filter(lambda book: book.isbn not in identifiers and book.gid["#text"] not in identifiers, author_info.books)
|
||||
other_books = get_unique_other_books(entries.all(), author_info.books)
|
||||
|
||||
return render_title_template('author.html', entries=entries, pagination=pagination,
|
||||
title=name, author=author_info, other_books=other_books)
|
||||
|
||||
|
||||
def get_unique_other_books(library_books, author_books):
    """Return the books in ``author_books`` that do not duplicate a library book.

    A book is dropped when either of these holds:

    * its ``isbn`` or its ``gid["#text"]`` equals the ``val`` of any
      identifier attached to a book in ``library_books``; or
    * ``levenshtein_support`` is true and its title, with any parenthesised
      text removed, has a ``Levenshtein.ratio`` above 0.7 against the title
      of some library book.

    Note: Not all images will be shown, even though they're available on
    Goodreads.com.
    See https://www.goodreads.com/topic/show/18213769-goodreads-book-images

    :param library_books: iterable of objects exposing ``identifiers`` (each
        item having a ``val`` attribute) and ``title``.
    :param author_books: iterable of objects exposing ``isbn``, ``gid``
        (indexable by ``"#text"``) and ``title``.
    :return: a new list with the remaining books, in their original order.
    """
    # The library is walked twice (identifiers, then titles), so materialise
    # it once in case the caller passed a one-shot iterable.
    library_books = list(library_books)

    # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by
    # that list so we show fewer duplicates.
    # A list (not a set) is kept on purpose: membership then relies on
    # equality only, exactly as before, and the values need not be hashable.
    identifiers = [
        identifier.val
        for book in library_books
        for identifier in book.identifiers
    ]
    other_books = [
        book for book in author_books
        if book.isbn not in identifiers and book.gid["#text"] not in identifiers
    ]

    # Fuzzy match book titles
    if levenshtein_support:
        library_titles = [book.title for book in library_books]

        def _matches_library_title(title):
            # Remove items in parentheses before comparing
            bare_title = re.sub(r"\(.*\)", "", title)
            # any() yields a real boolean on both Python 2 and 3; negating a
            # Python 3 ``filter`` object is always False because the iterator
            # itself is truthy.
            return any(
                Levenshtein.ratio(bare_title, library_title) > 0.7
                for library_title in library_titles
            )

        # With no library titles nothing can match, so skip the pass entirely.
        if library_titles:
            other_books = [
                author_book for author_book in other_books
                if not _matches_library_title(author_book.title)
            ]

    return other_books
|
||||
|
||||
|
||||
|
||||
@app.route("/series")
|
||||
@login_required_if_no_ano
|
||||
def series_list():
|
||||
|
@ -11,4 +11,5 @@ PyYAML==3.12
|
||||
rsa==3.4.2
|
||||
six==1.10.0
|
||||
uritemplate==3.0.0
|
||||
goodreads==0.3.2
|
||||
goodreads>=0.3.2
|
||||
python-Levenshtein>=0.12.0
|
||||
|
Loading…
Reference in New Issue
Block a user