Source code for invenio_records_rest.schemas.fields.sanitizedunicode

# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2016-2018 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Sanitized Unicode string field."""

from ftfy import fix_text

from .trimmedstring import TrimmedString


[docs]class SanitizedUnicode(TrimmedString): """String field that sanitizes and fixes problematic unicode characters.""" UNWANTED_CHARACTERS = { # Zero-width space "\u200b", }
[docs] def is_valid_xml_char(self, char): """Check if a character is valid based on the XML specification.""" codepoint = ord(char) return ( 0x20 <= codepoint <= 0xD7FF or codepoint in (0x9, 0xA, 0xD) or 0xE000 <= codepoint <= 0xFFFD or 0x10000 <= codepoint <= 0x10FFFF )
def _deserialize(self, value, attr, data, **kwargs): """Deserialize sanitized string value.""" value = super()._deserialize(value, attr, data, **kwargs) value = fix_text(value) # NOTE: This `join` might be ineffiecient... There's a solution with a # large compiled regex lying around, but needs a lot of tweaking. value = "".join(filter(self.is_valid_xml_char, value)) for char in self.UNWANTED_CHARACTERS: value = value.replace(char, "") return value