ruby--ruby/lib/syntax_suggest/clean_document.rb

# frozen_string_literal: true

module SyntaxSuggest
  # Parses and sanitizes source into a lexically aware document
  #
  # Internally the document is represented by an array with each
  # index containing a CodeLine correlating to a line from the source code.
  #
  # There are three main phases in the algorithm:
  #
  # 1. Sanitize/format input source
  # 2. Search for invalid blocks
  # 3. Format invalid blocks into something meaningful
  #
  # This class handles the first part.
  #
  # The reason this class exists is to format input source
  # for better/easier/cleaner exploration.
  #
  # The CodeSearch class operates at the line level so
  # we must be careful to not introduce lines that look
  # valid by themselves, but when removed will trigger syntax errors
  # or strange behavior.
  #
  # ## Join Trailing slashes
  #
  # Code with a trailing slash is logically treated as a single line:
  #
  #     1 it "code can be split" \
  #     2    "across multiple lines" do
  #
  # In this case removing line 2 would add a syntax error. We get around
  # this by internally joining the two lines into a single "line" object
  #
  # ## Logically Consecutive lines
  #
  # Code such as a chain of method calls can be broken
  # across multiple lines:
  #
  #     1 User.
  #     2   where(name: "schneems").
  #     3   first
  #
  # Removing line 2 can introduce a syntax error. To fix this, all lines
  # are joined into one.
  #
  # ## Heredocs
  #
  # A heredoc is a way of defining a multi-line string. They can cause many
  # problems. If each line is left on its own, Ripper would try to parse the
  # contents as ruby code rather than as a string. Even without this problem,
  # we still hit an issue with indentation
  #
  #    1 foo = <<~HEREDOC
  #    2  "Be yourself; everyone else is already taken."
  #    3    ― Oscar Wilde
  #    4      puts "I look like ruby code" # but i'm still a heredoc
  #    5 HEREDOC
  #
  # If we didn't join these lines then our algorithm would think that line 4
  # is separate from the rest, has a higher indentation, then look at it first
  # and remove it.
  #
  # If the code evaluates line 5 by itself it will think line 5 is a constant,
  # remove it, and introduce a syntax error.
  #
  # All of these problems are fixed by joining the whole heredoc into a single
  # line.
  #
  # ## Comments and whitespace
  #
  # Comments can throw off the way the lexer tells us that the line
  # logically belongs with the next line. This is valid ruby but
  # results in a different lex output than before:
  #
  #     1 User.
  #     2   where(name: "schneems").
  #     3   # Comment here
  #     4   first
  #
  # To handle this we can replace comment lines with empty lines
  # and then re-lex the source. This removal and re-lexing preserves
  # line index and document size, but generates an easier to work with
  # document.
  #
  class CleanDocument
    # Builds the document from raw source
    #
    # The source is first passed through `clean_sweep`, the resulting
    # lines are then handed to `CodeLine.from_source` to build the
    # internal array of CodeLines.
    def initialize(source:)
      lines = clean_sweep(source: source)
      @document = CodeLine.from_source(lines.join, lines: lines)
    end

    # Call all of the document "cleaners"
    # and return self
    #
    # Cleaners run in this order: trailing slashes, logically
    # consecutive lines, then heredocs.
    def call
      join_trailing_slash!
      join_consecutive!
      join_heredoc!

      self
    end

    # Return an array of CodeLines in the
    # document
    def lines
      @document
    end

    # Renders the document back to a string
    def to_s
      @document.join
    end

    # Remove comments and whitespace only lines
    #
    # replace with empty newlines
    #
    #     source = <<~'EOM'
    #       # Comment 1
    #       puts "hello"
    #       # Comment 2
    #       puts "world"
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).lines
    #     expect(lines[0].to_s).to eq("\n")
    #     expect(lines[1].to_s).to eq(%(puts "hello"\n))
    #     expect(lines[2].to_s).to eq("\n")
    #     expect(lines[3].to_s).to eq(%(puts "world"\n))
    #
    # Important: This must be done before lexing.
    #
    # After this change is made, we lex the document because
    # removing comments can change how the doc is parsed.
    #
    # For example:
    #
    #     values = LexAll.new(source: <<~EOM)
    #       User.
    #         # comment
    #         where(name: 'schneems')
    #     EOM
    #     expect(
    #       values.count {|v| v.type == :on_ignored_nl}
    #     ).to eq(1)
    #
    # After the comment is removed:
    #
    #     values = LexAll.new(source: <<~EOM)
    #       User.
    #
    #         where(name: 'schneems')
    #     EOM
    #     expect(
    #       values.count {|v| v.type == :on_ignored_nl}
    #     ).to eq(2)
    #
    # Returns an Array of Strings with the same number of elements
    # as `source.lines`.
    def clean_sweep(source:)
      source.lines.map do |line|
        # Matches whitespace-only lines and comment lines. A line that
        # starts with `#{` is left alone since it may be interpolation
        # inside of a heredoc body rather than a comment.
        #
        # The comment body is optional so a bare `#` is treated as a comment
        # even when it is the last line and has no trailing newline.
        if line.match?(/^\s*(#([^{].*)?)?$/)
          # `$/` is the input record separator ("\n" unless it was changed)
          $/
        else
          line
        end
      end
    end

    # Smushes all heredoc lines into one line
    #
    #     source = <<~'EOM'
    #       foo = <<~HEREDOC
    #          lol
    #          hehehe
    #       HEREDOC
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_heredoc!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    #
    # Returns self
    def join_heredoc!
      open_heredoc_indexes = []
      heredoc_ranges = []

      lines.each do |code_line|
        code_line.lex.each do |lex_value|
          case lex_value.type
          when :on_heredoc_beg
            open_heredoc_indexes << code_line.index
          when :on_heredoc_end
            # The most recently opened heredoc is the one being closed
            start_index = open_heredoc_indexes.pop

            # An end without a recorded beginning cannot form a group.
            # Without this guard a nil start would build a beginless range
            # and swallow every line from the top of the document.
            next unless start_index

            heredoc_ranges << (start_index..code_line.index)
          end
        end
      end

      heredoc_groups = heredoc_ranges.map { |range| @document[range] }

      join_groups(heredoc_groups)
      self
    end

    # Smushes logically "consecutive" lines
    #
    #     source = <<~'EOM'
    #       User.
    #         where(name: 'schneems').
    #         first
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    #
    # The one known case this doesn't handle is:
    #
    #     Ripper.lex <<~EOM
    #       a &&
    #        b ||
    #        c
    #     EOM
    #
    # For some reason this introduces `on_ignored_nl` but with BEG type
    #
    # Returns self
    def join_consecutive!
      continued_lines = @document.select(&:ignore_newline_not_beg?)

      # One group per continued line: the run of continued lines starting
      # there plus the line that finishes the expression. Groups that start
      # inside an earlier group are skipped later by `join_groups`.
      consecutive_groups = continued_lines.map do |code_line|
        take_while_including(code_line.index..-1, &:ignore_newline_not_beg?)
      end

      join_groups(consecutive_groups)
      self
    end

    # Join lines with a trailing slash
    #
    #     source = <<~'EOM'
    #       it "code can be split" \
    #          "across multiple lines" do
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_trailing_slash!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    #
    # Returns self
    def join_trailing_slash!
      slash_lines = @document.select(&:trailing_slash?)

      trailing_groups = slash_lines.map do |code_line|
        take_while_including(code_line.index..-1, &:trailing_slash?)
      end

      join_groups(trailing_groups)
      self
    end

    # Helper method for joining "groups" of lines
    #
    # Input is expected to be type Array<Array<CodeLine>>
    #
    # The outer array holds the various "groups" while the
    # inner array holds code lines.
    #
    # All code lines are "joined" into the first line in
    # their group.
    #
    # To preserve document size, empty lines are placed
    # in the place of the lines that were "joined"
    #
    # Returns self
    def join_groups(groups)
      groups.each do |group|
        first_line = group.first

        # Handle the case of multiple groups in a row
        # if one is already replaced, move on
        next if @document[first_line.index].empty?

        # Join group into the first line
        @document[first_line.index] = CodeLine.new(
          lex: group.map(&:lex).flatten,
          line: group.join,
          index: first_line.index
        )

        # Hide the rest of the lines
        group.drop(1).each do |joined_line|
          # The above lines already have newlines in them, if add more
          # then there will be double newline, use an empty line instead
          @document[joined_line.index] = CodeLine.new(line: "", index: joined_line.index, lex: [])
        end
      end
      self
    end

    # Helper method for grabbing elements from document
    #
    # Like `take_while` except when it stops
    # iterating, it also returns the line
    # that caused it to stop
    #
    # The `range` selects which slice of the document is walked and
    # the block is called with each line until it returns a falsey value.
    def take_while_including(range = 0..-1)
      stop_after_this_line = false
      @document[range].take_while do |line|
        # The previous line failed the check and was already included, stop here
        next if stop_after_this_line

        stop_after_this_line = !(yield line)
        true
      end
    end
  end
end
Sync SyntaxSuggest ``` $ tool/sync_default_gems.rb syntax_suggest ``` 2022-07-26 16:21:09 -04:00			`# frozen_string_literal: true`

			`module SyntaxSuggest`
			`# Parses and sanitizes source into a lexically aware document`
			`#`
			`# Internally the document is represented by an array with each`
			`# index containing a CodeLine correlating to a line from the source code.`
			`#`
			`# There are three main phases in the algorithm:`
			`#`
			`# 1. Sanitize/format input source`
			`# 2. Search for invalid blocks`
			`# 3. Format invalid blocks into something meaninful`
			`#`
			`# This class handles the first part.`
			`#`
			`# The reason this class exists is to format input source`
			`# for better/easier/cleaner exploration.`
			`#`
			`# The CodeSearch class operates at the line level so`
			`# we must be careful to not introduce lines that look`
			`# valid by themselves, but when removed will trigger syntax errors`
			`# or strange behavior.`
			`#`
			`# ## Join Trailing slashes`
			`#`
			`# Code with a trailing slash is logically treated as a single line:`
			`#`
			`# 1 it "code can be split" \`
			`# 2 "across multiple lines" do`
			`#`
			`# In this case removing line 2 would add a syntax error. We get around`
			`# this by internally joining the two lines into a single "line" object`
			`#`
			`# ## Logically Consecutive lines`
			`#`
			`# Code that can be broken over multiple`
			`# lines such as method calls are on different lines:`
			`#`
			`# 1 User.`
			`# 2 where(name: "schneems").`
			`# 3 first`
			`#`
			`# Removing line 2 can introduce a syntax error. To fix this, all lines`
			`# are joined into one.`
			`#`
			`# ## Heredocs`
			`#`
			`# A heredoc is an way of defining a multi-line string. They can cause many`
			`# problems. If left as a single line, Ripper would try to parse the contents`
			`# as ruby code rather than as a string. Even without this problem, we still`
			`# hit an issue with indentation`
			`#`
			`# 1 foo = <<~HEREDOC`
			`# 2 "Be yourself; everyone else is already taken.""`
			`# 3 ― Oscar Wilde`
			`# 4 puts "I look like ruby code" # but i'm still a heredoc`
			`# 5 HEREDOC`
			`#`
			`# If we didn't join these lines then our algorithm would think that line 4`
			`# is separate from the rest, has a higher indentation, then look at it first`
			`# and remove it.`
			`#`
			`# If the code evaluates line 5 by itself it will think line 5 is a constant,`
			`# remove it, and introduce a syntax errror.`
			`#`
			`# All of these problems are fixed by joining the whole heredoc into a single`
			`# line.`
			`#`
			`# ## Comments and whitespace`
			`#`
			`# Comments can throw off the way the lexer tells us that the line`
			`# logically belongs with the next line. This is valid ruby but`
			`# results in a different lex output than before:`
			`#`
			`# 1 User.`
			`# 2 where(name: "schneems").`
			`# 3 # Comment here`
			`# 4 first`
			`#`
			`# To handle this we can replace comment lines with empty lines`
			`# and then re-lex the source. This removal and re-lexing preserves`
			`# line index and document size, but generates an easier to work with`
			`# document.`
			`#`
			`class CleanDocument`
			`def initialize(source:)`
			`lines = clean_sweep(source: source)`
			`@document = CodeLine.from_source(lines.join, lines: lines)`
			`end`

			`# Call all of the document "cleaners"`
			`# and return self`
			`def call`
			`join_trailing_slash!`
			`join_consecutive!`
			`join_heredoc!`

			`self`
			`end`

			`# Return an array of CodeLines in the`
			`# document`
			`def lines`
			`@document`
			`end`

			`# Renders the document back to a string`
			`def to_s`
			`@document.join`
			`end`

			`# Remove comments and whitespace only lines`
			`#`
			`# replace with empty newlines`
			`#`
			`# source = <<~'EOM'`
			`# # Comment 1`
			`# puts "hello"`
			`# # Comment 2`
			`# puts "world"`
			`# EOM`
			`#`
			`# lines = CleanDocument.new(source: source).lines`
			`# expect(lines[0].to_s).to eq("\n")`
			`# expect(lines[1].to_s).to eq("puts "hello")`
			`# expect(lines[2].to_s).to eq("\n")`
			`# expect(lines[3].to_s).to eq("puts "world")`
			`#`
			`# Important: This must be done before lexing.`
			`#`
			`# After this change is made, we lex the document because`
			`# removing comments can change how the doc is parsed.`
			`#`
			`# For example:`
			`#`
			`# values = LexAll.new(source: <<~EOM))`
			`# User.`
			`# # comment`
			`# where(name: 'schneems')`
			`# EOM`
			`# expect(`
			`# values.count {\|v\| v.type == :on_ignored_nl}`
			`# ).to eq(1)`
			`#`
			`# After the comment is removed:`
			`#`
			`# values = LexAll.new(source: <<~EOM))`
			`# User.`
			`#`
			`# where(name: 'schneems')`
			`# EOM`
			`# expect(`
			`# values.count {\|v\| v.type == :on_ignored_nl}`
			`# ).to eq(2)`
			`#`
			`def clean_sweep(source:)`
			`source.lines.map do \|line\|`
			`if line.match?(/^\s(#[^{].)?$/) # https://rubular.com/r/LLE10D8HKMkJvs`
			`$/`
			`else`
			`line`
			`end`
			`end`
			`end`

			`# Smushes all heredoc lines into one line`
			`#`
			`# source = <<~'EOM'`
			`# foo = <<~HEREDOC`
			`# lol`
			`# hehehe`
			`# HEREDOC`
			`# EOM`
			`#`
			`# lines = CleanDocument.new(source: source).join_heredoc!.lines`
			`# expect(lines[0].to_s).to eq(source)`
			`# expect(lines[1].to_s).to eq("")`
			`def join_heredoc!`
			`start_index_stack = []`
			`heredoc_beg_end_index = []`
			`lines.each do \|line\|`
			`line.lex.each do \|lex_value\|`
			`case lex_value.type`
			`when :on_heredoc_beg`
			`start_index_stack << line.index`
			`when :on_heredoc_end`
			`start_index = start_index_stack.pop`
			`end_index = line.index`
			`heredoc_beg_end_index << [start_index, end_index]`
			`end`
			`end`
			`end`

			`heredoc_groups = heredoc_beg_end_index.map { \|start_index, end_index\| @document[start_index..end_index] }`

			`join_groups(heredoc_groups)`
			`self`
			`end`

			`# Smushes logically "consecutive" lines`
			`#`
			`# source = <<~'EOM'`
			`# User.`
			`# where(name: 'schneems').`
			`# first`
			`# EOM`
			`#`
			`# lines = CleanDocument.new(source: source).join_consecutive!.lines`
			`# expect(lines[0].to_s).to eq(source)`
			`# expect(lines[1].to_s).to eq("")`
			`#`
			`# The one known case this doesn't handle is:`
			`#`
			`# Ripper.lex <<~EOM`
			`# a &&`
			`# b \|\|`
			`# c`
			`# EOM`
			`#`
			# For some reason this introduces `on_ignore_newline` but with BEG type
			`#`
			`def join_consecutive!`
			`consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do \|code_line\|`
			`take_while_including(code_line.index..-1) do \|line\|`
			`line.ignore_newline_not_beg?`
			`end`
			`end`

			`join_groups(consecutive_groups)`
			`self`
			`end`

			`# Join lines with a trailing slash`
			`#`
			`# source = <<~'EOM'`
			`# it "code can be split" \`
			`# "across multiple lines" do`
			`# EOM`
			`#`
			`# lines = CleanDocument.new(source: source).join_consecutive!.lines`
			`# expect(lines[0].to_s).to eq(source)`
			`# expect(lines[1].to_s).to eq("")`
			`def join_trailing_slash!`
			`trailing_groups = @document.select(&:trailing_slash?).map do \|code_line\|`
			`take_while_including(code_line.index..-1) { \|x\| x.trailing_slash? }`
			`end`
			`join_groups(trailing_groups)`
			`self`
			`end`

			`# Helper method for joining "groups" of lines`
			`#`
			`# Input is expected to be type Array<Array<CodeLine>>`
			`#`
			`# The outer array holds the various "groups" while the`
			`# inner array holds code lines.`
			`#`
			`# All code lines are "joined" into the first line in`
			`# their group.`
			`#`
			`# To preserve document size, empty lines are placed`
			`# in the place of the lines that were "joined"`
			`def join_groups(groups)`
			`groups.each do \|lines\|`
			`line = lines.first`

			`# Handle the case of multiple groups in a a row`
			`# if one is already replaced, move on`
			`next if @document[line.index].empty?`

			`# Join group into the first line`
			`@document[line.index] = CodeLine.new(`
			`lex: lines.map(&:lex).flatten,`
			`line: lines.join,`
			`index: line.index`
			`)`

			`# Hide the rest of the lines`
			`lines[1..-1].each do \|line\|`
			`# The above lines already have newlines in them, if add more`
			`# then there will be double newline, use an empty line instead`
			`@document[line.index] = CodeLine.new(line: "", index: line.index, lex: [])`
			`end`
			`end`
			`self`
			`end`

			`# Helper method for grabbing elements from document`
			`#`
			# Like `take_while` except when it stops
			`# iterating, it also returns the line`
			`# that caused it to stop`
			`def take_while_including(range = 0..-1)`
			`take_next_and_stop = false`
			`@document[range].take_while do \|line\|`
			`next if take_next_and_stop`

			`take_next_and_stop = !(yield line)`
			`true`
			`end`
			`end`
			`end`
			`end`