From 49af2d3a4f1f51ec0c842df41b293b348574ec3f Mon Sep 17 00:00:00 2001
From: Gilles Peskine
Date: Fri, 6 Dec 2019 19:20:13 +0100
Subject: [PATCH] Support non-ASCII characters in headers

Filter out non-ASCII characters in automatically processed headers.

Do this in a way that minimizes the code change: keep manipulating
strings, but strip off non-ASCII characters when reading lines, which
should only remove characters in comments that we don't parse anyway.
---
 scripts/generate_psa_constants.py        | 11 ++++++++---
 tests/scripts/test_psa_constant_names.py | 12 ++++++++----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/scripts/generate_psa_constants.py b/scripts/generate_psa_constants.py
index c2d2558094..a9de148d76 100755
--- a/scripts/generate_psa_constants.py
+++ b/scripts/generate_psa_constants.py
@@ -270,11 +270,16 @@ class MacroCollector:
             # Other macro without parameter
             return
 
+    _nonascii_re = re.compile(rb'[^\x00-\x7f]+')
+    _continued_line_re = re.compile(rb'\\\r?\n\Z')
     def read_file(self, header_file):
         for line in header_file:
-            while line.endswith('\\\n'):
+            m = re.search(self._continued_line_re, line)
+            while m:
                 cont = next(header_file)
-                line = line[:-2] + cont
+                line = line[:m.start(0)] + cont
+                m = re.search(self._continued_line_re, line)
+            line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
             self.read_line(line)
 
     @staticmethod
@@ -380,7 +385,7 @@ class MacroCollector:
 def generate_psa_constants(header_file_names, output_file_name):
     collector = MacroCollector()
     for header_file_name in header_file_names:
-        with open(header_file_name) as header_file:
+        with open(header_file_name, 'rb') as header_file:
             collector.read_file(header_file)
     temp_file_name = output_file_name + '.tmp'
     with open(temp_file_name, 'w') as output_file:
diff --git a/tests/scripts/test_psa_constant_names.py b/tests/scripts/test_psa_constant_names.py
index 7553394f90..4829321374 100755
--- a/tests/scripts/test_psa_constant_names.py
+++ b/tests/scripts/test_psa_constant_names.py
@@ -43,12 +43,14 @@ class read_file_lines:
     except that if process(line) raises an exception, then the read_file_lines
     snippet annotates the exception with the file name and line number.
     """
-    def __init__(self, filename):
+    def __init__(self, filename, binary=False):
         self.filename = filename
         self.line_number = 'entry'
         self.generator = None
+        self.binary = binary
     def __enter__(self):
-        self.generator = enumerate(open(self.filename, 'r'))
+        self.generator = enumerate(open(self.filename,
+                                        'rb' if self.binary else 'r'))
         return self
     def __iter__(self):
         for line_number, content in self.generator:
@@ -224,13 +226,15 @@ class Inputs:
             if m.group(3):
                 self.argspecs[name] = self._argument_split(m.group(3))
 
+    _nonascii_re = re.compile(rb'[^\x00-\x7f]+')
     def parse_header(self, filename):
         """Parse a C header file, looking for "#define PSA_xxx"."""
-        with read_file_lines(filename) as lines:
+        with read_file_lines(filename, binary=True) as lines:
             for line in lines:
+                line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
                 self.parse_header_line(line)
 
-    _macro_identifier_re = r'[A-Z]\w+'
+    _macro_identifier_re = re.compile(r'[A-Z]\w+')
     def generate_undeclared_names(self, expr):
         for name in re.findall(self._macro_identifier_re, expr):
             if name not in self.all_declared: