Skip to main content
Sign in
Snippets Groups Projects
Commit 9915c309 authored by Thomas Morrell's avatar Thomas Morrell Committed by Nicola
Browse files

encoding: cleanup detection and override ASCII to default encoding

parent a8aa2c84
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,8 @@
#
# This file is part of Invenio.
# Copyright (C) 2016-2019 CERN.
# Copyright (C) 2023 Northwestern University.
# Copyright (C) 2023 California Institute of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
......@@ -23,17 +25,24 @@ def detect_encoding(fp, default=None):
"""
init_pos = fp.tell()
try:
sample = fp.read(
current_app.config.get('PREVIEWER_CHARDET_BYTES', 1024))
chardet_size = current_app.config.get("PREVIEWER_CHARDET_BYTES", 1024)
threshold = current_app.config.get("PREVIEWER_CHARDET_CONFIDENCE", 0.9)
sample = fp.read(chardet_size)
# Result contains 'confidence' and 'encoding'
result = cchardet.detect(sample)
threshold = current_app.config.get('PREVIEWER_CHARDET_CONFIDENCE', 0.9)
if result.get('confidence', 0) > threshold:
return result.get('encoding', default)
else:
return default
confidence = result.get("confidence", 0) or 0
encoding = result.get("encoding", default) or default
# if low confidence or ascii, override to default (usually utf8 which is
# better in case of unicode beyond checked range)
if confidence <= threshold or encoding == "ASCII":
encoding = default
return encoding
except Exception:
current_app.logger.warning('Encoding detection failed.', exc_info=True)
current_app.logger.warning("Encoding detection failed.", exc_info=True)
return default
finally:
fp.seek(init_pos)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment