Mathics3 · rocky · Mar 14, 2021 · Feb 8, 2021 · Feb 8, 2021 · Feb 8, 2021
diff --git a/Makefile b/Makefile
@@ -11,8 +11,10 @@ RM  ?= rm
 
 .PHONY: all build \
    check clean \
-   develop dist doc doc-data djangotest \
-   gstest pytest \
+   develop dist doc \
+   inputrc-no-unicode \
+   inputrc-unicode \
+   pytest \
    rmChangeLog \
    test
 
@@ -41,6 +43,9 @@ install: build
 
 test check: pytest
 
+#: Build Sphinx HTML documentation (runs the docs/ sub-make via $(MAKE))
+doc:  mathics_scanner/data/characters.json
+	$(MAKE) -C docs html
 
 #: Remove derived files
 clean:

diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation: every target except "help" is
+# forwarded to sphinx-build's "make mode" (-M) by the catch-all rule below.
+
+# You can set these variables from the command line; the first two use ?=
+# and so can also be supplied through the environment.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation: passes %1 to "sphinx-build -M"
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -0,0 +1,76 @@
+===
+API
+===
+
+.. automodule:: mathics_scanner
+  :members: is_symbol_name
+
+Tokenization
+============
+
+Tokenization is performed by the ``Tokeniser`` class. The ``next`` method
+consumes characters from a feeder and returns a token if the tokenization
+succeeds. If the tokenization fails an instance of ``TranslateError`` is
+raised.
+
+.. autoclass:: Tokeniser(object)
+  :members: __init__, incomplete, sntx_message, next
+
+The tokens returned by ``next`` are instances of the ``Token`` class:
+
+.. autoclass:: Token(object)
+  :members: __init__
+  :special-members:
+
+Feeders
+=======
+
+A feeder is an intermediate between the tokeniser and the actual file being scanned. Feeders used by the tokeniser are instances of the ``LineFeeder`` class:
+
+.. autoclass:: LineFeeder(object)
+  :members: feed, empty, message, syntax_message
+
+Specialized Feeders
+-------------------
+
+To read multiple lines of code at a time use the ``MultiLineFeeder`` class:
+
+.. autoclass:: MultiLineFeeder(LineFeeder)
+  :members: __init__
+
+To read a single line of code at a time use the ``SingleLineFeeder`` class:
+
+.. autoclass:: SingleLineFeeder(LineFeeder)
+  :members: __init__
+
+To read lines of code from a file use the ``FileLineFeeder`` class:
+
+.. autoclass:: FileLineFeeder(LineFeeder)
+  :members: __init__
+
+Character Conversions
+=====================
+
+.. automodule:: mathics_scanner.characters
+  :members: replace_wl_with_plain_text, replace_unicode_with_wl
+
+The ``mathics_scanner.characters`` module also exposes special dictionaries:
+
+``named_characters``
+  Maps fully qualified names of named characters to their corresponding
+  code-points in Wolfram's internal representation:
+
+.. code-block:: python
+
+  for named_char, code in named_characters.items():
+    print(f"The named character {named_char} maps to U+{ord(code):X}")
+
+``aliased_characters``
+  Maps the ESC sequence alias of all aliased characters to their corresponding
+  code points in Wolfram's internal representation.
+
+mathics_scanner.generate.rl_inputrc
+-----------------------------------
+
+.. automodule:: mathics_scanner.generate.rl_inputrc
+  :members: generate_inputrc
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -0,0 +1,54 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import mathics_scanner
+
+# -- Project information -----------------------------------------------------
+
+project = 'mathics-scanner'
+copyright = '2021, The Mathics Team'
+author = 'The Mathics Team'
+
+# The full version, including alpha/beta/rc tags
+release = '1.0.1'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ["sphinx.ext.autodoc"]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/implementation.rst → docs/source/implementation.rst b/implementation.rst → docs/source/implementation.rst
@@ -1,9 +1,54 @@
-mathics_scanner.characters
-==========================
+==============
+Implementation
+==============
 
-This module consists mostly of translation tables between Wolfram's internal
-representation and Unicode/ASCII. For maintainability, it was decided to store
-this data in a human-readable YAML table (in ``data/named-characters.yml``).
+The Tokeniser
+=============
+
+Tokenization is performed by the ``Tokeniser`` class. The most important
+method in this class is by far the ``next`` method. This method consumes
+characters from the feeder and returns a token (if the tokenization succeeds).
+
+Tokenization Rules
+------------------
+
+Tokenization rules are defined by declaring methods (in the ``Tokeniser``
+class) whose names are preceded by ``t_``, such as in the following example: ::
+
+   def t_SomeRule(self, match):
+       # Some logic goes here...
+       pass
+
+A tokenization rule is supposed to take a regular expression match (the 
+``match`` parameter of type ``re.Match``) and convert it to an appropriate 
+token, which is then returned by the method. The rule is also responsible for 
+updating the internal state of the tokeniser, such as incrementing the ``pos`` 
+counter.
+
+A rule is always expected to receive sane input. In other words, deciding which
+rule to call is a responsibility of the caller. Rules are also
+automatically called from inside of ``next``.
+
+Messaging Functionality
+-----------------------
+
+Warnings and errors encountered during scanning and tokenization are collected
+in a message queue and stored in the feeders using the ``message`` and
+``syntax_message`` methods of ``LineFeeder``. The message queue is therefore a
+property of the feeder. The ``Tokeniser`` class also has a method to append
+messages to the message queue of its feeder, the ``syntax_message`` method.
+
+The messages are stored using Mathics' internal format, but this is going to be
+revised in the next release (in fact, we plan to replace messages by errors
+entirely).
+
+Character Conversions
+=====================
+
+The ``mathics_scanner.characters`` module consists mostly of translation tables
+between Wolfram's internal representation and Unicode/ASCII. For
+maintainability, it was decided to store this data in a human-readable YAML
+table (in ``data/named-characters.yml``).
 
 The YAML table mainly contains information about how to convert a
 named character to Unicode and back. If a given character has a direct Unicode

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -0,0 +1,22 @@
+Welcome to mathics-scanner's documentation!
+===========================================
+
+This is the tokeniser or scanner portion for the Wolfram Language.
+
+As such, it also contains a full set of translations between Wolfram Language
+named characters, their Unicode/ASCII equivalents and code-points.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   usage
+   api
+   implementation
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -0,0 +1,18 @@
+=====================
+Using mathics-scanner
+=====================
+
+This is used as the scanner inside `Mathics <https://mathics.org>`_ but it can
+also be used for tokenizing and formatting Wolfram Language code. In fact, we
+intend to write a formatter. This library is also quite useful if you need to
+work with Wolfram Language named characters and convert them to various formats.
+
+- For tokenizing and scanning Wolfram Language code, use the
+  ``mathics_scanner.Tokeniser`` class.
+- To convert between Wolfram Language named characters and Unicode/ASCII, use
+  the ``mathics_scanner.characters.replace_wl_with_plain_text`` and
+  ``mathics_scanner.characters.replace_unicode_with_wl`` functions.
+- To convert between qualified names of named characters (such as ``FormalA`` for
+  ``\[FormalA]``) and Wolfram's internal representation use the
+  ``mathics_scanner.characters.named_characters`` dictionary.
+
diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-Wolfram-language scanner
+This is the tokeniser or scanner portion for the Wolfram Language.
+
+As such, it also contains a full set of translation between Wolfram Language
+named characters, their Unicode/ASCII equivalents and code-points.
 """
 
 from mathics_scanner.version import __version__
@@ -11,7 +14,8 @@
     replace_unicode_with_wl,
     replace_wl_with_plain_text,
 )
-from mathics_scanner.tokeniser import is_symbol_name, Tokeniser
+# TODO: Move is_symbol_name to the characters module
+from mathics_scanner.tokeniser import is_symbol_name, Tokeniser, Token
 from mathics_scanner.errors import (
     InvalidSyntaxError,
     IncompleteSyntaxError,

diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py
@@ -1,4 +1,10 @@
 # -*- coding: utf-8 -*-
+"""
+The ``mathics_scanner.characters`` module consists mostly of translation tables
+between Wolfram's internal representation of `named characters
+<https://reference.wolfram.com/language/tutorial/InputAndOutputInNotebooks.html#4718>`_
+and Unicode/ASCII.
+"""
 
 import re
 import ujson
@@ -49,8 +55,8 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str:
     Language named characters. This functions replaces all occurrences of such
     characters with their corresponding Unicode/ASCII equivalents.
 
-    @param: wl_input    The string whose characters will be replaced.
-    @param: use_unicode A flag that indicates whether to use Unicode or ASCII
+    :param wl_input: The string whose characters will be replaced.
+    :param use_unicode: A flag that indicates whether to use Unicode or ASCII
                         for the conversion.
 
     Note that the occurrences of named characters in ``wl_input`` are expect to
@@ -72,7 +78,7 @@ def replace_unicode_with_wl(unicode_input: str) -> str:
     corresponding Unicode equivalents of such characters with the characters
     themselves.
 
-    @param: unicode_input The string whose characters will be replaced.
+    :param unicode_input: The string whose characters will be replaced.
 
     Note that the occurrences of named characters in the output of
     ``replace_unicode_with_wl`` are represented using Wolfram's internal

diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py
@@ -78,8 +78,8 @@ class MultiLineFeeder(LineFeeder):
 
     def __init__(self, lines, filename=""):
         """
-        @param: lines    The source of the feeder (a string).
-        @param: filename A string that describes the source of the feeder, i.e.
+        :param lines: The source of the feeder (a string).
+        :param filename: A string that describes the source of the feeder, i.e.
                          the filename that is being feed.
         """
         super(MultiLineFeeder, self).__init__(filename)
@@ -106,8 +106,8 @@ class SingleLineFeeder(LineFeeder):
 
     def __init__(self, code, filename=""):
         """
-        @param: code     The source of the feeder (a string).
-        @param: filename A string that describes the source of the feeder, i.e.
+        :param code: The source of the feeder (a string).
+        :param filename: A string that describes the source of the feeder, i.e.
                          the filename that is being feed.
         """
         super().__init__(filename)
@@ -130,8 +130,8 @@ class FileLineFeeder(LineFeeder):
 
     def __init__(self, fileobject, trace_fn=None):
         """
-        @param: fileobject The source of the feeder (a string).
-        @param: filename   A string that describes the source of the feeder,
+        :param fileobject: The source of the feeder (a string).
+        :param filename: A string that describes the source of the feeder,
                            i.e.  the filename that is being feed.
         """
         super().__init__(fileobject.name)