Coverage for src/mkdocs_gallery/py_source_parser.py: 87%
91 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-30 08:26 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-30 08:26 +0000
1# Authors: Sylvain MARIE <sylvain.marie@se.com>
2# + All contributors to <https://github.com/smarie/mkdocs-gallery>
3#
4# Original idea and code: sphinx-gallery, <https://sphinx-gallery.github.io>
5# License: 3-clause BSD, <https://github.com/smarie/mkdocs-gallery/blob/master/LICENSE>
6"""
7Parser for python source files
8"""
10from __future__ import absolute_import, division, print_function
12import ast
13import platform
14import re
15import tokenize
16from packaging.version import parse as parse_version
17from io import BytesIO
18from pathlib import Path
19from textwrap import dedent
20from typing import Dict, List, Tuple, Union
22from .errors import ExtensionError
23from .mkdocs_compatibility import getLogger
# Logger for this module, obtained through the package's own
# ``mkdocs_compatibility.getLogger`` wrapper.
logger = getLogger("mkdocs-gallery")

# Docstring returned in place of the real one when an example script cannot
# be parsed (see the ``node is None`` case in ``_get_docstring_and_rest``).
SYNTAX_ERROR_DOCSTRING = """
SyntaxError
===========

Example script with invalid Python syntax
"""

# The pattern for in-file config comments is designed to not greedily match
# newlines at the start and end, except for one newline at the end. This
# ensures that the matched pattern can be removed from the code without
# changing the block structure; i.e. empty newlines are preserved, e.g. in
#
#     a = 1
#
#     # mkdocs_gallery_thumbnail_number = 2
#
#     b = 2
#
# Capture groups: group 1 is the option name (the part following the
# ``mkdocs_gallery_`` prefix) and group 3 is the text after ``=``. Group 3 is
# ``None`` when the comment has no ``= <value>`` part.
INFILE_CONFIG_PATTERN = re.compile(r"^[\ \t]*#\s*mkdocs_gallery_([A-Za-z0-9_]+)(\s*=\s*(.+))?[\ \t]*\n?", re.MULTILINE)
def parse_source_file(file: Path) -> Tuple[Union[ast.Module, None], str]:
    """Parse a Python source file into an AST.

    Parameters
    ----------
    file : Path
        Path of the source file. It is read as UTF-8 text.

    Returns
    -------
    node : ast.Module or None
        The parsed module, or ``None`` when the content cannot be parsed as
        Python source.
    content : str
        The decoded text of the file, with any CRLF line ending replaced by
        LF. It is returned even when parsing fails.
    """
    content = file.read_text(encoding="utf-8")

    # change from Windows format to UNIX for uniformity
    content = content.replace("\r\n", "\n")

    # Only the parsing step is guarded: decoding errors raised by
    # ``read_text`` above are deliberately left to propagate.
    try:
        node = ast.parse(content)
    except (SyntaxError, ValueError):
        # ``ValueError`` is what older Python versions (before 3.12) raise
        # for source containing NUL bytes; newer ones raise ``SyntaxError``.
        # Both mean "not parsable", so both are reported the same way.
        return None, content
    return node, content
def _raw_docstring(stmt):
    """Return the unprocessed text of a docstring statement, else ``None``.

    Parameters
    ----------
    stmt : ast.stmt
        A statement node, typically the first statement of a module body.

    Returns
    -------
    text : str or None
        The string value when *stmt* is an expression statement made of a
        plain string literal, ``None`` otherwise.
    """
    if not isinstance(stmt, ast.Expr):
        return None
    literal = stmt.value
    if isinstance(literal, ast.Constant):
        # Python >= 3.8 parses every literal as ``ast.Constant``; only a
        # ``str`` value qualifies as a docstring.
        return literal.value if isinstance(literal.value, str) else None
    if type(literal).__name__ == "Str":
        # Python < 3.8 parses string literals as ``ast.Str``. The class is
        # matched by name so that newer interpreters never touch the
        # deprecated ``ast.Str`` attribute.
        return literal.s
    return None


def _get_docstring_and_rest(file: Path):
    """Separate the content of ``file`` between docstring and the rest.

    Strongly inspired from ast.get_docstring.

    Parameters
    ----------
    file : Path
        The source file

    Returns
    -------
    docstring : str
        Docstring of ``file``. When the file cannot be parsed this is
        ``SYNTAX_ERROR_DOCSTRING``.
    rest : str
        Content of ``file`` located after the last line of the docstring.
        When the file cannot be parsed this is the whole content.
    lineno : int
        The 1-based line number of the first line of ``rest``.
    node : ast.Module or None
        The parsed module, ``None`` when the file cannot be parsed.

    Raises
    ------
    ExtensionError
        If the parsed node is not a module, or if the module does not start
        with a docstring.
    """
    node, content = parse_source_file(file)

    if node is None:
        return SYNTAX_ERROR_DOCSTRING, content, 1, node

    if not isinstance(node, ast.Module):
        raise ExtensionError("This function only supports modules. " "You provided {0}".format(node.__class__.__name__))

    raw_docstring = _raw_docstring(node.body[0]) if node.body else None
    if raw_docstring is None:
        raise ExtensionError(
            f'Could not find docstring in file "{file}". '
            "A docstring is required by mkdocs-gallery "
            'unless the file is ignored by "ignore_pattern"'
        )

    if parse_version(platform.python_version()) >= parse_version("3.7"):
        docstring = ast.get_docstring(node)
        assert docstring is not None  # noqa  # should be guaranteed above
        # ``ast.get_docstring`` cleans the text; put back a leading newline
        # for strict backward compatibility.
        if raw_docstring.startswith("\n"):
            docstring = "\n" + docstring
        ts = tokenize.tokenize(BytesIO(content.encode()).readline)
        # find the first string according to the tokenizer and get its end row
        for tk in ts:
            if tk.exact_type == tokenize.STRING:
                lineno, _ = tk.end
                break
        else:
            lineno = 0
    else:
        # TODO this block can be removed when python 3.6 support is dropped
        docstring = raw_docstring
        lineno = node.body[0].lineno  # The last line of the string.

    # This gets the content of the file after the docstring last line
    rest = "\n".join(content.split("\n")[lineno:])
    lineno += 1
    return docstring, rest, lineno, node
def extract_file_config(content):
    """Pull out the file-specific config given in in-file config comments.

    Every comment line of the form ``# mkdocs_gallery_<name> = <value>``
    found in *content* contributes one entry. ``<value>`` is evaluated as a
    Python literal.

    Parameters
    ----------
    content : str
        Source text to scan.

    Returns
    -------
    file_conf : dict
        Mapping of option name (without the ``mkdocs_gallery_`` prefix) to
        its evaluated value. Comments without a ``= <value>`` part are
        skipped. Comments whose value is not a valid Python literal are
        skipped after a warning is logged.
    """
    file_conf = {}
    for match in INFILE_CONFIG_PATTERN.finditer(content):
        name = match.group(1)
        value = match.group(3)
        if value is None:  # a flag rather than a config setting
            continue
        try:
            value = ast.literal_eval(value)
        except (SyntaxError, ValueError, TypeError):
            # ``TypeError`` covers literals that parse but cannot be built,
            # e.g. a dict or set containing an unhashable element.
            logger.warning("mkdocs-gallery option %s was passed invalid value %s", name, value)
        else:
            file_conf[name] = value
    return file_conf
def split_code_and_text_blocks(
    source_file: Union[str, Path], return_node=False
) -> Union[Tuple[Dict, List], Tuple[Dict, List, ast.AST]]:
    """Split a source file into an ordered list of code and text blocks.

    The module docstring is always the first text block. After it, a comment
    line made of at least 20 ``#`` characters, or a ``# %%`` / ``#%%`` line,
    starts a text block formed by the comment lines that directly follow it.
    Everything in between is code.

    Parameters
    ----------
    source_file : Union[str, Path]
        Path to the source file.
    return_node : bool
        If True, return the ast node.

    Returns
    -------
    file_conf : dict
        File-specific settings given in source file comments as:
        ``# mkdocs_gallery_<name> = <value>``
    blocks : list
        (label, content, line_number)
        List where each element is a tuple with the label ('text' or 'code'),
        the corresponding content string of block and the leading line number
    node : ast Node
        The parsed node. Only part of the result when ``return_node`` is True.
    """
    docstring, body, current_line, node = _get_docstring_and_rest(Path(source_file))
    file_conf = extract_file_config(body)

    header_re = re.compile(
        r"(?P<header_line>^#{20,}.*|^# ?%%.*)\s(?P<text_content>(?:^#.*\s?)*)",
        flags=re.M,
    )
    leading_hash_re = re.compile("^#", flags=re.M)

    blocks = [("text", docstring, 1)]
    cursor = 0
    for header in header_re.finditer(body):
        # Code located between the previous text block and this header.
        code = body[cursor : header.start()]
        if code.strip():
            blocks.append(("code", code, current_line))
        # Skip the code lines plus the header line itself, which is ignored.
        current_line += code.count("\n") + 1

        # Comment lines following the header: drop the leading '#' of each
        # line, then the common indentation and any leading whitespace.
        commented_text = header.group("text_content")
        text = dedent(leading_hash_re.sub("", commented_text)).lstrip()
        if text.strip():
            blocks.append(("text", text, current_line))
        current_line += commented_text.count("\n")

        cursor = header.end()

    # Whatever follows the last text block is code.
    trailing_code = body[cursor:]
    if trailing_code.strip():
        blocks.append(("code", trailing_code, current_line))

    if return_node:
        return file_conf, blocks, node
    return file_conf, blocks
def remove_config_comments(code_block):
    """
    Strip the in-file config comments out of *code_block*.

    Every comment line matching '# mkdocs_gallery_[option] = [val]' is
    dropped; the empty lines around it are left untouched.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        The code segment without its config comment lines.
    """
    return INFILE_CONFIG_PATTERN.sub("", code_block)