diff --git a/invenio_previewer/utils.py b/invenio_previewer/utils.py index f069f23afc1adaf7962c272ee2bc49b174d5c8f4..2b93cb0e583fb09ee698ce59846e6a87d98c7832 100644 --- a/invenio_previewer/utils.py +++ b/invenio_previewer/utils.py @@ -2,6 +2,8 @@ # # This file is part of Invenio. # Copyright (C) 2016-2019 CERN. +# Copyright (C) 2023 Northwestern University. +# Copyright (C) 2023 California Institute of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -23,17 +25,24 @@ def detect_encoding(fp, default=None): """ init_pos = fp.tell() try: - sample = fp.read( - current_app.config.get('PREVIEWER_CHARDET_BYTES', 1024)) + chardet_size = current_app.config.get("PREVIEWER_CHARDET_BYTES", 1024) + threshold = current_app.config.get("PREVIEWER_CHARDET_CONFIDENCE", 0.9) + + sample = fp.read(chardet_size) + # Result contains 'confidence' and 'encoding' result = cchardet.detect(sample) - threshold = current_app.config.get('PREVIEWER_CHARDET_CONFIDENCE', 0.9) - if result.get('confidence', 0) > threshold: - return result.get('encoding', default) - else: - return default + confidence = result.get("confidence", 0) or 0 + encoding = result.get("encoding", default) or default + + # if low confidence or ascii, override to default (usually utf8 which is + # better in case of unicode beyond checked range) + if confidence <= threshold or encoding == "ASCII": + encoding = default + + return encoding except Exception: - current_app.logger.warning('Encoding detection failed.', exc_info=True) + current_app.logger.warning("Encoding detection failed.", exc_info=True) return default finally: fp.seek(init_pos)