Skip to content
Snippets Groups Projects
Commit 9915c309 authored by Thomas Morrell, committed by Nicola
Browse files

encoding: cleanup detection and override ASCII to default encoding

parent a8aa2c84
Branches
No related tags found
No related merge requests found
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# #
# This file is part of Invenio. # This file is part of Invenio.
# Copyright (C) 2016-2019 CERN. # Copyright (C) 2016-2019 CERN.
# Copyright (C) 2023 Northwestern University.
# Copyright (C) 2023 California Institute of Technology.
# #
# Invenio is free software; you can redistribute it and/or modify it # Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details. # under the terms of the MIT License; see LICENSE file for more details.
...@@ -23,17 +25,24 @@ def detect_encoding(fp, default=None): ...@@ -23,17 +25,24 @@ def detect_encoding(fp, default=None):
""" """
init_pos = fp.tell() init_pos = fp.tell()
try: try:
sample = fp.read( chardet_size = current_app.config.get("PREVIEWER_CHARDET_BYTES", 1024)
current_app.config.get('PREVIEWER_CHARDET_BYTES', 1024)) threshold = current_app.config.get("PREVIEWER_CHARDET_CONFIDENCE", 0.9)
sample = fp.read(chardet_size)
# Result contains 'confidence' and 'encoding' # Result contains 'confidence' and 'encoding'
result = cchardet.detect(sample) result = cchardet.detect(sample)
threshold = current_app.config.get('PREVIEWER_CHARDET_CONFIDENCE', 0.9) confidence = result.get("confidence", 0) or 0
if result.get('confidence', 0) > threshold: encoding = result.get("encoding", default) or default
return result.get('encoding', default)
else: # if low confidence or ascii, override to default (usually utf8 which is
return default # better in case of unicode beyond checked range)
if confidence <= threshold or encoding == "ASCII":
encoding = default
return encoding
except Exception: except Exception:
current_app.logger.warning('Encoding detection failed.', exc_info=True) current_app.logger.warning("Encoding detection failed.", exc_info=True)
return default return default
finally: finally:
fp.seek(init_pos) fp.seek(init_pos)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment