0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import itertools
0019 import os
0020 import re
0021 from collections import namedtuple
0022
0023
0024 from mkdocs.structure.pages import markdown
0025
0026 from pyspark.java_gateway import launch_gateway
0027
0028
# Lightweight record wrapping the JVM-side expression info for one SQL
# built-in function (see `_list_grouped_function_infos`).
ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")

# Function groups to include in the generated documentation; expression
# infos whose group is not listed here are skipped.
groups = {
    "agg_funcs", "array_funcs", "datetime_funcs",
    "json_funcs", "map_funcs", "window_funcs",
}
0035
0036
def _list_grouped_function_infos(jvm):
    """
    Returns a list of function information grouped by each group value via JVM.
    Sorts wrapped expression infos in each group by name and returns them.
    """
    py_sql_utils = jvm.org.apache.spark.sql.api.python.PythonSQLUtils
    jinfos = py_sql_utils.listBuiltinFunctionInfos()

    # Wrap only the JVM-side expression infos that belong to a documented group,
    # substituting the real function name for the `_FUNC_` placeholder.
    infos = []
    for jinfo in jinfos:
        if jinfo.getGroup() not in groups:
            continue
        func_name = jinfo.getName()
        raw_usage = jinfo.getUsage()
        infos.append(ExpressionInfo(
            name=func_name,
            usage=raw_usage if raw_usage is None else raw_usage.replace("_FUNC_", func_name),
            examples=jinfo.getExamples().replace("_FUNC_", func_name),
            group=jinfo.getGroup()))

    # `itertools.groupby` only merges consecutive items, so sort by group first.
    infos.sort(key=lambda info: info.group)
    grouped = itertools.groupby(infos, key=lambda info: info.group)
    return [(group, sorted(members, key=lambda info: info.name))
            for group, members in grouped]
0060
0061
0062
0063 def _make_pretty_usage(infos):
0064 """
0065 Makes the usage description pretty and returns a formatted string.
0066
0067 Expected input:
0068
0069 func(*) - ...
0070
0071 func(expr[, expr...]) - ...
0072
0073 Expected output:
0074 <table class="table">
0075 <thead>
0076 <tr>
0077 <th style="width:25%">Function</th>
0078 <th>Description</th>
0079 </tr>
0080 </thead>
0081 <tbody>
0082 <tr>
0083 <td>func(*)</td>
0084 <td>...</td>
0085 </tr>
0086 <tr>
0087 <td>func(expr[, expr...])</td>
0088 <td>...</td>
0089 </tr>
0090 </tbody>
0091 ...
0092 </table>
0093
0094 """
0095
0096 result = []
0097 result.append("<table class=\"table\">")
0098 result.append(" <thead>")
0099 result.append(" <tr>")
0100 result.append(" <th style=\"width:25%\">Function</th>")
0101 result.append(" <th>Description</th>")
0102 result.append(" </tr>")
0103 result.append(" </thead>")
0104 result.append(" <tbody>")
0105
0106 for info in infos:
0107
0108
0109
0110
0111 usages = iter(re.split(r"(%s.*) - " % info.name, info.usage.strip())[1:])
0112 for (sig, description) in zip(usages, usages):
0113 result.append(" <tr>")
0114 result.append(" <td>%s</td>" % sig)
0115 result.append(" <td>%s</td>" % description.strip())
0116 result.append(" </tr>")
0117
0118 result.append(" </tbody>")
0119 result.append("</table>\n")
0120 return "\n".join(result)
0121
0122
def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Every example query is executed through `jspark` so the generated
    documentation shows real query output.

    Expected input:

    Examples:
      > SELECT func(col)...;
       ...
      > SELECT func(col)...;
       ...

    Expected output:
    <div class="codehilite"><pre><span></span>
    <span class="c1">-- func</span>
    <span class="k">SELECT</span>
    ...
    </pre></div>
    """
    # Example queries appear one per line behind this exact marker.
    query_prefix = " > "
    pretty_output = ""
    for info in infos:
        if not info.examples.startswith("\n Examples:"):
            continue
        output = ["-- %s" % info.name]
        for line in info.examples.split("\n"):
            if not line.startswith(query_prefix):
                continue
            # Remove only the literal marker. The previous `lstrip(" > ")`
            # treated its argument as a character *set*, so it would also eat
            # any leading run of ' ' or '>' characters from the query itself.
            query = line[len(query_prefix):]
            print(" %s" % query)
            # showString(numRows=20, truncate=20, vertical=False)
            query_output = jspark.sql(query).showString(20, 20, False)
            output.append(query)
            output.append(query_output)
        pretty_output += "\n" + "\n".join(output)
    if pretty_output != "":
        return markdown.markdown(
            "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])
    # No group member had runnable examples.
    return None
0162
0163
def generate_functions_table_html(jvm, html_output_dir):
    """
    Generates a HTML file after listing the function information. The output file
    is created under `html_output_dir`, one file per function group, named
    `generated-<group>-table.html` with underscores in the group name replaced
    by dashes.

    Expected output:

    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>
    """
    for group, infos in _list_grouped_function_infos(jvm):
        table = _make_pretty_usage(infos)
        file_name = "generated-%s-table.html" % group.replace("_", "-")
        with open(os.path.join(html_output_dir, file_name), 'w') as table_html:
            table_html.write(table)
0197
0198
def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates a HTML file after listing and executing the function information.
    The output file is created under `html_output_dir`, one file per function
    group that has runnable examples, named `generated-<group>-examples.html`.

    Expected output:

    <div class="codehilite"><pre><span></span>
    <span class="c1">-- func</span>
    <span class="k">SELECT</span>
    ...
    </pre></div>

    """
    print("Running SQL examples to generate formatted output.")
    for group, infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, infos)
        if examples is None:
            # Group had no runnable examples; emit no file for it.
            continue
        file_name = "generated-%s-examples.html" % group.replace("_", "-")
        with open(os.path.join(html_output_dir, file_name), 'w') as examples_html:
            examples_html.write(examples)
0221
0222
if __name__ == "__main__":
    # Talk to Spark through a fresh Py4J gateway; the heavy lifting runs in the JVM.
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    # Silence INFO/WARN logs so only the example queries and their output show.
    jspark.sparkContext().setLogLevel("ERROR")
    # This script lives one level below the Spark root; docs go in <root>/docs.
    html_output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)