Coverage for src/mkdocs_gallery/py_source_parser.py: 87%

91 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-30 08:26 +0000

1# Authors: Sylvain MARIE <sylvain.marie@se.com> 

2# + All contributors to <https://github.com/smarie/mkdocs-gallery> 

3# 

4# Original idea and code: sphinx-gallery, <https://sphinx-gallery.github.io> 

5# License: 3-clause BSD, <https://github.com/smarie/mkdocs-gallery/blob/master/LICENSE> 

6""" 

7Parser for python source files 

8""" 

9 

10from __future__ import absolute_import, division, print_function 

11 

12import ast 

13import platform 

14import re 

15import tokenize 

16from packaging.version import parse as parse_version 

17from io import BytesIO 

18from pathlib import Path 

19from textwrap import dedent 

20from typing import Dict, List, Tuple, Union 

21 

22from .errors import ExtensionError 

23from .mkdocs_compatibility import getLogger 

24 

25logger = getLogger("mkdocs-gallery") 

26 

# Docstring substituted for example scripts that cannot be parsed.
SYNTAX_ERROR_DOCSTRING = """
SyntaxError
===========

Example script with invalid Python syntax
"""

# In-file config comments look like ``# mkdocs_gallery_<name> = <value>``.
#
# The pattern deliberately avoids swallowing newlines before the comment and
# consumes at most ONE newline after it. Removing a match from the code thus
# leaves the block structure untouched, i.e. surrounding empty lines are
# preserved, e.g. in
#
#     a = 1
#
#     # mkdocs_gallery_thumbnail_number = 2
#
#     b = 2
INFILE_CONFIG_PATTERN = re.compile(
    r"^[\ \t]*"  # optional indentation before the comment
    r"#\s*mkdocs_gallery_([A-Za-z0-9_]+)"  # group 1: option name
    r"(\s*=\s*(.+))?"  # group 3: raw option value (absent for bare flags)
    r"[\ \t]*\n?",  # trailing blanks and at most one newline
    re.MULTILINE,
)

45 

46 

def parse_source_file(file: Path):
    """Read a python source file and parse it into an AST.

    Parameters
    ----------
    file : Path
        Path of the source file, read as utf-8.

    Returns
    -------
    node : ast.Module or None
        The parsed module, or ``None`` when the file has invalid syntax.
    content : str
        The decoded text of the file, with Windows line endings converted
        to UNIX ones.
    """
    # Normalize Windows line endings to UNIX ones for uniformity
    source = file.read_text(encoding="utf-8").replace("\r\n", "\n")

    try:
        tree = ast.parse(source)
    except SyntaxError:
        # Invalid scripts are still returned, just without a node
        tree = None

    return tree, source

72 

73 

def _get_docstring_and_rest(file: Path):
    """Separate the content of ``file`` between its docstring and the rest.

    Strongly inspired from ast.get_docstring.

    Parameters
    ----------
    file : Path
        The source file

    Returns
    -------
    docstring : str
        The module docstring of ``file``, or ``SYNTAX_ERROR_DOCSTRING`` when
        the file could not be parsed.
    rest : str
        The content of ``file`` after the last line of the docstring (the
        whole content when the file could not be parsed).
    lineno : int
        The 1-based line number at which ``rest`` starts.
    node : ast.Module or None
        The parsed module, ``None`` when the file could not be parsed.

    Raises
    ------
    ExtensionError
        If the parsed node is not a module, or if the module does not start
        with a docstring.
    """
    node, content = parse_source_file(file)

    if node is None:
        return SYNTAX_ERROR_DOCSTRING, content, 1, node

    if not isinstance(node, ast.Module):
        raise ExtensionError("This function only supports modules. " "You provided {0}".format(node.__class__.__name__))

    # ``ast.get_docstring`` returns None exactly when the first statement is
    # not a string expression. Relying on it (rather than on ``ast.Str`` and
    # the ``.s`` attribute, both removed in python 3.14) keeps this check
    # working on every python version.
    raw_docstring = ast.get_docstring(node, clean=False)
    if raw_docstring is None:
        raise ExtensionError(
            f'Could not find docstring in file "{file}". '
            "A docstring is required by mkdocs-gallery "
            'unless the file is ignored by "ignore_pattern"'
        )

    if parse_version(platform.python_version()) >= parse_version("3.7"):
        docstring = ast.get_docstring(node)
        # Strict backward compatibility: keep the leading newline that the
        # cleaning done by ast.get_docstring strips.
        if raw_docstring.startswith("\n"):
            docstring = "\n" + docstring
        ts = tokenize.tokenize(BytesIO(content.encode()).readline)
        # find the first string according to the tokenizer and get its end row
        for tk in ts:
            if tk.type == tokenize.STRING:
                lineno, _ = tk.end
                break
        else:
            lineno = 0
    else:
        # TODO this block can be removed when python 3.6 support is dropped
        docstring = raw_docstring
        lineno = node.body[0].lineno  # The last line of the string.

    # Keep only the content of the file located after the docstring last line
    rest = "\n".join(content.split("\n")[lineno:])
    lineno += 1
    return docstring, rest, lineno, node

135 

136 

def extract_file_config(content):
    """Pull out the file-specific config specified in the comments of ``content``.

    Every comment line matching ``INFILE_CONFIG_PATTERN``, i.e. of the form
    ``# mkdocs_gallery_<name> = <value>``, is considered.

    Parameters
    ----------
    content : str
        The source code to scan.

    Returns
    -------
    file_conf : dict
        Maps each option ``<name>`` to its ``<value>`` evaluated as a python
        literal. Bare flags (no ``= <value>`` part) are skipped, and options
        whose value is not a valid python literal are skipped with a warning.
    """
    file_conf = {}
    for match in INFILE_CONFIG_PATTERN.finditer(content):
        name = match.group(1)
        raw_value = match.group(3)
        if raw_value is None:  # a flag rather than a config setting
            continue
        try:
            value = ast.literal_eval(raw_value)
        except (SyntaxError, ValueError, TypeError):
            # TypeError is raised by literal_eval for e.g. unhashable dict
            # keys or set elements: report it like any other invalid value
            # instead of crashing.
            logger.warning("mkdocs-gallery option %s was passed invalid value %s", name, raw_value)
        else:
            file_conf[name] = value
    return file_conf

154 

155 

def split_code_and_text_blocks(
    source_file: Union[str, Path], return_node=False
) -> Union[Tuple[Dict, List], Tuple[Dict, List, ast.AST]]:
    """Split a source file into an ordered list of text and code blocks.

    The module docstring is always the first (text) block. The rest of the
    file is cut at each header line (a line of 20 or more hashes, or a
    ``# %%`` cell marker): the comment lines that directly follow a header
    form a text block, everything else forms code blocks.

    Parameters
    ----------
    source_file : Union[str, Path]
        Path to the source file.
    return_node : bool
        If True, return the ast node.

    Returns
    -------
    file_conf : dict
        File-specific settings given in source file comments as:
        ``# mkdocs_gallery_<name> = <value>``
    blocks : list
        (label, content, line_number)
        List where each element is a tuple with the label ('text' or 'code'),
        the corresponding content string of block and the leading line number
    node : ast Node
        The parsed node. Only returned when ``return_node`` is True.
    """
    source_file = Path(source_file)
    docstring, body, current_line, node = _get_docstring_and_rest(source_file)
    file_conf = extract_file_config(body)

    header_re = re.compile(
        r"(?P<header_line>^#{20,}.*|^# ?%%.*)\s(?P<text_content>(?:^#.*\s?)*)",
        flags=re.M,
    )
    leading_hash_re = re.compile("^#", flags=re.M)

    blocks = [("text", docstring, 1)]
    cursor = 0
    for header in header_re.finditer(body):
        # Code located between the previous match and this header
        code = body[cursor : header.start()]
        if code.strip():
            blocks.append(("code", code, current_line))
        # Skip the code lines, plus the header line itself which is ignored
        current_line += code.count("\n") + 1

        # Comment lines following the header: drop the hash and the indent
        raw_text = header.group("text_content")
        text = dedent(leading_hash_re.sub("", raw_text)).lstrip()
        if text.strip():
            blocks.append(("text", text, current_line))
        current_line += raw_text.count("\n")

        cursor = header.end()

    # Whatever follows the last header is code
    tail = body[cursor:]
    if tail.strip():
        blocks.append(("code", tail, current_line))

    if return_node:
        return file_conf, blocks, node
    return file_conf, blocks

216 

217 

def remove_config_comments(code_block):
    """
    Strip the in-file config comments out of *code_block*.

    Every comment line of the form '# mkdocs_gallery_[option] = [val]' is
    dropped; the empty lines around it are left untouched.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        The code segment without its config comment lines.
    """
    return INFILE_CONFIG_PATTERN.sub("", code_block)