#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import itertools
import os
import re
from collections import namedtuple

# To avoid adding a new direct dependency, we import markdown from within mkdocs.
from mkdocs.structure.pages import markdown

from pyspark.java_gateway import launch_gateway


ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")

groups = {
    "agg_funcs", "array_funcs", "datetime_funcs",
    "json_funcs", "map_funcs", "window_funcs",
}


def _list_grouped_function_infos(jvm):
    """
    Returns a list of (group, [ExpressionInfo]) pairs for the built-in functions,
    fetched via the JVM and grouped by each group value. The expression infos in
    each group are sorted by function name.
    """

    jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
    infos = []

    for jinfo in filter(lambda x: x.getGroup() in groups, jinfos):
        name = jinfo.getName()
        usage = jinfo.getUsage()
        usage = usage.replace("_FUNC_", name) if usage is not None else usage
        infos.append(ExpressionInfo(
            name=name,
            usage=usage,
            examples=jinfo.getExamples().replace("_FUNC_", name),
            group=jinfo.getGroup()))

    # Groups the expression infos by each group value. Sorting by group first is
    # required because `itertools.groupby` only merges adjacent items.
    grouped_infos = itertools.groupby(sorted(infos, key=lambda x: x.group), key=lambda x: x.group)
    # Then, sorts the expression infos in each group by name.
    return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos]
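
# A minimal sketch (with hypothetical tuples, not the real ExpressionInfo
# objects) of the sort-then-group pattern used above. `itertools.groupby` only
# merges *adjacent* items with equal keys, which is why the input is sorted by
# group before grouping:
#
#   >>> pairs = [("b", 2), ("a", 1), ("b", 1)]
#   >>> [(k, sorted(g)) for k, g in
#   ...  itertools.groupby(sorted(pairs), key=lambda x: x[0])]
#   [('a', [('a', 1)]), ('b', [('b', 1), ('b', 2)])]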


# TODO(SPARK-31499): Needs to add a column to describe arguments and their types
def _make_pretty_usage(infos):
    """
    Makes the usage description pretty and returns a formatted string.

    Expected input:

        func(*) - ...

        func(expr[, expr...]) - ...

    Expected output:
    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>
    """

    result = []
    result.append("<table class=\"table\">")
    result.append("  <thead>")
    result.append("    <tr>")
    result.append("      <th style=\"width:25%\">Function</th>")
    result.append("      <th>Description</th>")
    result.append("    </tr>")
    result.append("  </thead>")
    result.append("  <tbody>")

    for info in infos:
        # Extracts (signature, description) pairs from `info.usage`.
        # Expected formats are as follows:
        #  - `_FUNC_(...) - description`, or
        #  - `_FUNC_ - description`
        # `re.escape` guards against function names containing regex metacharacters.
        usages = iter(re.split(r"(%s.*) - " % re.escape(info.name), info.usage.strip())[1:])
        # Zipping one iterator with itself pairs consecutive items.
        for (sig, description) in zip(usages, usages):
            result.append("    <tr>")
            result.append("      <td>%s</td>" % sig)
            result.append("      <td>%s</td>" % description.strip())
            result.append("    </tr>")

    result.append("  </tbody>")
    result.append("</table>\n")
    return "\n".join(result)
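
# A minimal sketch (with a hypothetical `some_func` usage string) of the
# extraction above. The capturing group keeps each signature in the result of
# `re.split`, and zipping one iterator with itself pairs consecutive items;
# the trailing whitespace on each description is stripped by the loop above:
#
#   >>> usage = "some_func(expr) - Returns a value.\n\nsome_func(expr, fmt) - ..."
#   >>> parts = iter(re.split(r"(some_func.*) - ", usage)[1:])
#   >>> list(zip(parts, parts))
#   [('some_func(expr)', 'Returns a value.\n\n'), ('some_func(expr, fmt)', '...')]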


def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Expected input:

        Examples:
          > SELECT func(col)...;
           ...
          > SELECT func(col)...;
           ...

    Expected output:
    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>
    """

    pretty_output = ""
    for info in infos:
        if info.examples.startswith("\n    Examples:"):
            output = []
            output.append("-- %s" % info.name)
            query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
            for query_example in query_examples:
                # Strips the leading "      > " prompt marker. Note that
                # `str.lstrip` takes a character set, not a prefix, so the
                # marker is removed by slicing the stripped line instead.
                query = query_example.strip()[len("> "):]
                print("    %s" % query)
                query_output = jspark.sql(query).showString(20, 20, False)
                output.append(query)
                output.append(query_output)
            pretty_output += "\n" + "\n".join(output)
    if pretty_output != "":
        return markdown.markdown(
            "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])
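
# A minimal sketch of the rendering step above, assuming python-markdown with
# Pygments available for the `codehilite` extension; the exact markup varies
# by version, but the highlighted SQL is wrapped in a `codehilite` div roughly
# like this:
#
#   >>> markdown.markdown("```sql\nSELECT 1;\n```",
#   ...                   extensions=['codehilite', 'fenced_code'])
#   '<div class="codehilite"><pre><span></span>...</pre></div>'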


def generate_functions_table_html(jvm, html_output_dir):
    """
    Generates an HTML file after listing the function information. The output file
    is created under `html_output_dir`.

    Expected output:

    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>
    """
    for key, infos in _list_grouped_function_infos(jvm):
        function_table = _make_pretty_usage(infos)
        key = key.replace("_", "-")
        with open("%s/generated-%s-table.html" % (html_output_dir, key), 'w') as table_html:
            table_html.write(function_table)
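
# For example (illustrative), the "agg_funcs" group would be written to
# `<html_output_dir>/generated-agg-funcs-table.html`.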


def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates an HTML file after listing and executing the function information.
    The output file is created under `html_output_dir`.

    Expected output:

    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>
    """
    print("Running SQL examples to generate formatted output.")
    for key, infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, infos)
        key = key.replace("_", "-")
        if examples is not None:
            with open("%s/generated-%s-examples.html" % (
                    html_output_dir, key), 'w') as examples_html:
                examples_html.write(examples)


if __name__ == "__main__":
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)