gitlab-org--gitlab-foss/scripts/lib/glfm/parse_examples.rb

# frozen_string_literal: true

# This module contains a Ruby port of Python logic from the `get_tests` method of the
# `spec_test.py` script (see copy of original code in a comment at the bottom of this file):
# https://github.com/github/cmark-gfm/blob/5dfedc7/test/spec_tests.py#L82
#
# The logic and structure is as unchanged as possible from the original Python - no
# cleanup or refactoring was done.
#
# Changes from the original logic were made to follow Ruby/GitLab syntax, idioms, and linting rules.
#
# Additional logic was also added to:
# 1. Capture all nested headers, not just the most recent.
# 2. Raise an exception if an unexpected state is encountered.
#
# Comments indicate where changes, deletions, or additions were made.
#
# See more detailed documentation of rules regarding the handling of headers
# in the comments at the top of `Glfm::UpdateExampleSnapshots#add_example_names`,
# in `scripts/lib/glfm/update_example_snapshots.rb`
module Glfm
  module ParseExamples
    REGULAR_TEXT = 0
    MARKDOWN_EXAMPLE = 1
    HTML_OUTPUT = 2
    EXAMPLE_BACKTICKS_LENGTH = 32
    EXAMPLE_BACKTICKS_STRING = '`' * EXAMPLE_BACKTICKS_LENGTH

    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
    def parse_examples(spec_txt_lines)
      line_number = 0
      start_line = 0
      example_number = 0
      markdown_lines = []
      html_lines = []
      state = REGULAR_TEXT # 0 regular text, 1 markdown example, 2 html output
      extensions = []
      headertext = '' # most recent header text
      headers = [] # all nested headers since last H2 - new logic compared to original Python code
      tests = []

      h1_regex = /\A# / # new logic compared to original Python code
      h2_regex = /\A## / # new logic compared to original Python code
      header_regex = /\A#+ / # Added beginning of line anchor to original Python code

      spec_txt_lines.each do |line|
        line_number += 1
        stripped_line = line.strip
        if stripped_line.start_with?("#{EXAMPLE_BACKTICKS_STRING} example")
          # If beginning line of an example block...
          state = MARKDOWN_EXAMPLE
          extensions = stripped_line[(EXAMPLE_BACKTICKS_LENGTH + " example".length)..].split
        elsif stripped_line == EXAMPLE_BACKTICKS_STRING
          # Else if end line of an example block...
          state = REGULAR_TEXT
          example_number += 1
          end_line = line_number

          # NOTE: The original implementation completely excludes disabled examples, but we need
          # to include them in order to correctly count the header numbering, so we set a flag
          # instead. This will need to be accounted for when we run conformance testing.

          # unless extensions.include?('disabled') # commented logic compared to original Python code
          tests <<
            {
              markdown: markdown_lines.join.tr('→', "\t"),
              html: html_lines.join.tr('→', "\t"),
              example: example_number,
              start_line: start_line,
              end_line: end_line,
              section: headertext,
              extensions: extensions,
              headers: headers.dup, # new logic compared to original Python code
              disabled: extensions.include?('disabled') # new logic compared to original Python code
            }
          # end # commented logic compared to original Python code

          start_line = 0
          markdown_lines = []
          html_lines = []
        elsif stripped_line == "."
          # Else if the example divider line...
          state = HTML_OUTPUT
          # Else if we are currently in a markdown example...
        elsif state == MARKDOWN_EXAMPLE
          start_line = line_number - 1 if start_line == 0

          markdown_lines.append(line)
        elsif state == HTML_OUTPUT
          # Else if we are currently in the html output...
          html_lines.append(line)
        elsif state == REGULAR_TEXT && line =~ header_regex
          # Else if we are in regular text and it is a header line
          # NOTE: This assumes examples are within the section under
          # Heading level 2 with Heading levels above 2 ignored

          # Extract the header text from the line
          headertext = line.gsub(header_regex, '').strip

          # The 'headers' array is new logic compared to the original Python code

          # reset the headers array if we found a new H1
          headers = [] if line =~ h1_regex

          # headers should be size 2 or less [<H1_headertext>, <H2_headertext>]
          # pop the last entry from the headers array if we are in an H2 and found a new H2
          headers.pop if headers.length == 2 && line =~ h2_regex

          # push the new header text to the headers array
          headers << headertext if line =~ h1_regex || line =~ h2_regex
        else
          # Else if we are in regular text...

          # This else block is new logic compared to original Python code

          # Sanity check for state machine
          raise 'Unexpected state encountered when parsing examples' unless state == REGULAR_TEXT

          # no-op - skips any other non-header regular text lines
        end
      end
      # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

      tests
    end
  end
end

# Original `get_tests` method from spec_test.py:
# rubocop:disable Style/BlockComments
# rubocop:disable Style/AsciiComments
=begin
def get_tests(specfile):
    line_number = 0
    start_line = 0
    end_line = 0
    example_number = 0
    markdown_lines = []
    html_lines = []
    state = 0  # 0 regular text, 1 markdown example, 2 html output
    extensions = []
    headertext = ''
    tests = []

    header_re = re.compile('#+ ')

    with open(specfile, 'r', encoding='utf-8', newline='\n') as specf:
        for line in specf:
            line_number = line_number + 1
            l = line.strip()
            if l.startswith("`" * 32 + " example"):
                state = 1
                extensions = l[32 + len(" example"):].split()
            elif l == "`" * 32:
                state = 0
                example_number = example_number + 1
                end_line = line_number
                if 'disabled' not in extensions:
                    tests.append({
                        "markdown":''.join(markdown_lines).replace('→',"\t"),
                        "html":''.join(html_lines).replace('→',"\t"),
                        "example": example_number,
                        "start_line": start_line,
                        "end_line": end_line,
                        "section": headertext,
                        "extensions": extensions})
                start_line = 0
                markdown_lines = []
                html_lines = []
            elif l == ".":
                state = 2
            elif state == 1:
                if start_line == 0:
                    start_line = line_number - 1
                markdown_lines.append(line)
            elif state == 2:
                html_lines.append(line)
            elif state == 0 and re.match(header_re, line):
                headertext = header_re.sub('', line).strip()
    return tests

=end
# rubocop:enable Style/BlockComments
# rubocop:enable Style/AsciiComments