# frozen_string_literal: true
# The Lexer is responsible for turning source text into tokens.
# This version is a performance-enhanced lexer (in comparison to the 3.x and earlier "future parser" lexer).
#
# The old lexer returned tokens as [:KEY, value, { locator => ... }].
# This lexer could instead return [[token], locator],
# or Token.new([token], locator) with the same API: x[0] == token_symbol, x[1] == self,
# x[:key] == one of (:value, :file, :line, :pos), etc.
require 'strscan'
require 'puppet/pops/parser/lexer_support'
require 'puppet/pops/parser/heredoc_support'
require 'puppet/pops/parser/interpolation_support'
require 'puppet/pops/parser/epp_support'
require 'puppet/pops/parser/slurp_support'
module Puppet::Pops
module Parser
class Lexer2
include LexerSupport
include HeredocSupport
include InterpolationSupport
include SlurpSupport
include EppSupport
# All tokens have three slots: the token name (a Symbol), the token text (a String), and the token text length.
# All operator and punctuation tokens reuse singleton arrays; tokens that require unique values create
# a unique array per token.
#
# PERFORMANCE NOTES:
# This construct reduces the number of objects that need to be created for operators and punctuation.
# The length is pre-calculated for all singleton tokens. The length is used both to signal the length of
# the token, and to advance the scanner position (without having to advance it with a scan(regexp)).
#
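# As an illustrative sketch (not part of the lexer itself): given the singleton
#   TOKEN_FARROW = [:FARROW, '=>', 2]
# a call like emit(TOKEN_FARROW, pos) sets the scanner position to pos + token[2]
# and returns [:FARROW, TokenValue], so no new array is allocated per occurrence of '=>'.
#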
TOKEN_LBRACK = [:LBRACK, '[', 1].freeze
TOKEN_LISTSTART = [:LISTSTART, '[', 1].freeze
TOKEN_RBRACK = [:RBRACK, ']', 1].freeze
TOKEN_LBRACE = [:LBRACE, '{', 1].freeze
TOKEN_RBRACE = [:RBRACE, '}', 1].freeze
TOKEN_SELBRACE = [:SELBRACE, '{', 1].freeze
TOKEN_LPAREN = [:LPAREN, '(', 1].freeze
TOKEN_WSLPAREN = [:WSLPAREN, '(', 1].freeze
TOKEN_RPAREN = [:RPAREN, ')', 1].freeze
TOKEN_EQUALS = [:EQUALS, '=', 1].freeze
TOKEN_APPENDS = [:APPENDS, '+=', 2].freeze
TOKEN_DELETES = [:DELETES, '-=', 2].freeze
TOKEN_ISEQUAL = [:ISEQUAL, '==', 2].freeze
TOKEN_NOTEQUAL = [:NOTEQUAL, '!=', 2].freeze
TOKEN_MATCH = [:MATCH, '=~', 2].freeze
TOKEN_NOMATCH = [:NOMATCH, '!~', 2].freeze
TOKEN_GREATEREQUAL = [:GREATEREQUAL, '>=', 2].freeze
TOKEN_GREATERTHAN = [:GREATERTHAN, '>', 1].freeze
TOKEN_LESSEQUAL = [:LESSEQUAL, '<=', 2].freeze
TOKEN_LESSTHAN = [:LESSTHAN, '<', 1].freeze
TOKEN_FARROW = [:FARROW, '=>', 2].freeze
TOKEN_PARROW = [:PARROW, '+>', 2].freeze
TOKEN_LSHIFT = [:LSHIFT, '<<', 2].freeze
TOKEN_LLCOLLECT = [:LLCOLLECT, '<<|', 3].freeze
TOKEN_LCOLLECT = [:LCOLLECT, '<|', 2].freeze
TOKEN_RSHIFT = [:RSHIFT, '>>', 2].freeze
TOKEN_RRCOLLECT = [:RRCOLLECT, '|>>', 3].freeze
TOKEN_RCOLLECT = [:RCOLLECT, '|>', 2].freeze
TOKEN_PLUS = [:PLUS, '+', 1].freeze
TOKEN_MINUS = [:MINUS, '-', 1].freeze
TOKEN_DIV = [:DIV, '/', 1].freeze
TOKEN_TIMES = [:TIMES, '*', 1].freeze
TOKEN_MODULO = [:MODULO, '%', 1].freeze
TOKEN_NOT = [:NOT, '!', 1].freeze
TOKEN_DOT = [:DOT, '.', 1].freeze
TOKEN_PIPE = [:PIPE, '|', 1].freeze
TOKEN_AT = [:AT, '@', 1].freeze
TOKEN_ATAT = [:ATAT, '@@', 2].freeze
TOKEN_COLON = [:COLON, ':', 1].freeze
TOKEN_COMMA = [:COMMA, ',', 1].freeze
TOKEN_SEMIC = [:SEMIC, ';', 1].freeze
TOKEN_QMARK = [:QMARK, '?', 1].freeze
TOKEN_TILDE = [:TILDE, '~', 1].freeze # lexed but not an operator in Puppet
TOKEN_REGEXP = [:REGEXP, nil, 0].freeze
TOKEN_IN_EDGE = [:IN_EDGE, '->', 2].freeze
TOKEN_IN_EDGE_SUB = [:IN_EDGE_SUB, '~>', 2].freeze
TOKEN_OUT_EDGE = [:OUT_EDGE, '<-', 2].freeze
TOKEN_OUT_EDGE_SUB = [:OUT_EDGE_SUB, '<~', 2].freeze
# Tokens that are always unique to what has been lexed
TOKEN_STRING = [:STRING, nil, 0].freeze
TOKEN_WORD = [:WORD, nil, 0].freeze
TOKEN_DQPRE = [:DQPRE, nil, 0].freeze
TOKEN_DQMID = [:DQMID, nil, 0].freeze
TOKEN_DQPOS = [:DQPOST, nil, 0].freeze
TOKEN_NUMBER = [:NUMBER, nil, 0].freeze
TOKEN_VARIABLE = [:VARIABLE, nil, 1].freeze
TOKEN_VARIABLE_EMPTY = [:VARIABLE, '', 1].freeze
# HEREDOC has syntax as an argument.
TOKEN_HEREDOC = [:HEREDOC, nil, 0].freeze
# EPP_START is currently a marker token, may later get syntax
TOKEN_EPPSTART = [:EPP_START, nil, 0].freeze
TOKEN_EPPEND = [:EPP_END, '%>', 2].freeze
TOKEN_EPPEND_TRIM = [:EPP_END_TRIM, '-%>', 3].freeze
# This is used for unrecognized tokens; the text is always a single character. This particular instance
# is not used, but is kept here for documentation purposes.
TOKEN_OTHER = [:OTHER, nil, 0]
# Keywords are all singleton tokens with pre-calculated lengths.
# Boolean values are pre-calculated (rather than evaluating the strings "false" and "true" repeatedly).
#
KEYWORDS = {
'case' => [:CASE, 'case', 4],
'class' => [:CLASS, 'class', 5],
'default' => [:DEFAULT, 'default', 7],
'define' => [:DEFINE, 'define', 6],
'if' => [:IF, 'if', 2],
'elsif' => [:ELSIF, 'elsif', 5],
'else' => [:ELSE, 'else', 4],
'inherits' => [:INHERITS, 'inherits', 8],
'node' => [:NODE, 'node', 4],
'and' => [:AND, 'and', 3],
'or' => [:OR, 'or', 2],
'undef' => [:UNDEF, 'undef', 5],
'false' => [:BOOLEAN, false, 5],
'true' => [:BOOLEAN, true, 4],
'in' => [:IN, 'in', 2],
'unless' => [:UNLESS, 'unless', 6],
'function' => [:FUNCTION, 'function', 8],
'type' => [:TYPE, 'type', 4],
'attr' => [:ATTR, 'attr', 4],
'private' => [:PRIVATE, 'private', 7],
'application' => [:APPLICATION, 'application', 11],
'consumes' => [:CONSUMES, 'consumes', 8],
'produces' => [:PRODUCES, 'produces', 8],
'site' => [:SITE, 'site', 4],
}
KEYWORDS.each {|k,v| v[1].freeze; v.freeze }
KEYWORDS.freeze
# Reverse lookup from token symbol to keyword string
KEYWORD_NAMES = {}
KEYWORDS.each {|k, v| KEYWORD_NAMES[v[0]] = k }
KEYWORD_NAMES.freeze
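# Illustrative lookups (not part of the lexer):
#   KEYWORDS['true']        # => [:BOOLEAN, true, 4]
#   KEYWORD_NAMES[:BOOLEAN] # => 'true' (the keyword registered last for that symbol)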
PATTERN_WS = %r{[[:blank:]\r]+}
PATTERN_NON_WS = %r{\w+\b?}
# A single line comment matches up to, but does not include, the terminating \n (which is handled by the newline selector).
PATTERN_COMMENT = %r{#.*\r?}
PATTERN_MLCOMMENT = %r{/\*(.*?)\*/}m
PATTERN_REGEX = %r{/[^/]*/}
PATTERN_REGEX_END = %r{/}
PATTERN_REGEX_A = %r{\A/} # for replacement to ""
PATTERN_REGEX_Z = %r{/\Z} # for replacement to ""
PATTERN_REGEX_ESC = %r{\\/} # for replacement to "/"
# The 3x patterns:
# PATTERN_CLASSREF = %r{((::){0,1}[A-Z][-\w]*)+}
# PATTERN_NAME = %r{((::)?[a-z0-9][-\w]*)(::[a-z0-9][-\w]*)*}
# The NAME and CLASSREF in 4x are strict. Each NAME segment must start with a lowercase letter a-z
# (and each CLASSREF segment with an uppercase letter A-Z); segments may not contain dashes (\w includes letters, digits and _).
#
PATTERN_CLASSREF = %r{((::){0,1}[A-Z][\w]*)+}
PATTERN_NAME = %r{^((::)?[a-z][\w]*)(::[a-z][\w]*)*$}
PATTERN_BARE_WORD = %r{((?:::){0,1}(?:[a-z_](?:[\w-]*[\w])?))+}
PATTERN_DOLLAR_VAR = %r{\$(::)?(\w+::)*\w+}
PATTERN_NUMBER = %r{\b(?:0[xX][0-9A-Fa-f]+|0?\d+(?:\.\d+)?(?:[eE]-?\d+)?)\b}
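# Illustrative matches (not part of the lexer):
#   PATTERN_NAME     =~ 'apache::mod'  # matches; each segment starts with a-z
#   PATTERN_NAME     =~ 'apache::Mod'  # nil; second segment starts with A-Z
#   PATTERN_CLASSREF =~ '::Foo::Bar'   # matches
#   PATTERN_NUMBER   =~ '0xFF'         # matches; so do '10', '3.14' and '1e-2'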
# PERFORMANCE NOTE:
# Comparison against a frozen string is faster (than unfrozen).
#
STRING_BSLASH_SLASH = '\/'.freeze
attr_reader :locator
def initialize()
@selector = {
'.' => lambda { emit(TOKEN_DOT, @scanner.pos) },
',' => lambda { emit(TOKEN_COMMA, @scanner.pos) },
'[' => lambda do
before = @scanner.pos
# Must check the preceding character to see if it is whitespace.
# The fastest thing to do is to simply byteslice to get the string ending at the offset before
# and then check what the last character is. (This is the same as what a locator.char_offset needs
# to compute, but with less overhead of trying to find out the global offset from a local offset in the
# case when this is sublocated in a heredoc).
if before == 0 || @scanner.string.byteslice(0, before)[-1] =~ /[[:blank:]\r\n]+/
emit(TOKEN_LISTSTART, before)
else
emit(TOKEN_LBRACK, before)
end
end,
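# Illustrative effect of the whitespace check above (a sketch, not executable here):
#   $a[1]   # '[' directly after a value => LBRACK (an index operation)
#   $a [1]  # '[' preceded by whitespace => LISTSTART (a literal array)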
']' => lambda { emit(TOKEN_RBRACK, @scanner.pos) },
'(' => lambda do
before = @scanner.pos
# If first on a line, or only whitespace between start of line and '('
# then the token is special to avoid being taken as start of a call.
line_start = @lexing_context[:line_lexical_start]
if before == line_start || @scanner.string.byteslice(line_start, before - line_start) =~ /\A[[:blank:]\r]+\Z/
emit(TOKEN_WSLPAREN, before)
else
emit(TOKEN_LPAREN, before)
end
end,
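# Illustratively (a sketch):
#   notice (1+2)  # '(' preceded by other text on the line => LPAREN (call arguments)
#   (1+2)         # '(' first on its line                  => WSLPAREN (a grouping)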
')' => lambda { emit(TOKEN_RPAREN, @scanner.pos) },
';' => lambda { emit(TOKEN_SEMIC, @scanner.pos) },
'?' => lambda { emit(TOKEN_QMARK, @scanner.pos) },
'*' => lambda { emit(TOKEN_TIMES, @scanner.pos) },
'%' => lambda do
scn = @scanner
before = scn.pos
la = scn.peek(2)
if la[1] == '>' && @lexing_context[:epp_mode]
scn.pos += 2
if @lexing_context[:epp_mode] == :expr
enqueue_completed(TOKEN_EPPEND, before)
end
@lexing_context[:epp_mode] = :text
interpolate_epp
else
emit(TOKEN_MODULO, before)
end
end,
'{' => lambda do
# The lexer needs to help the parser since the technology used cannot deal with
# lookahead of the same token with different precedence. This is solved by making a left brace
# after '?' into a separate token.
#
@lexing_context[:brace_count] += 1
emit(if @lexing_context[:after] == :QMARK
TOKEN_SELBRACE
else
TOKEN_LBRACE
end, @scanner.pos)
end,
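# Illustratively (a sketch):
#   $x ? { 'a' => 1 }  # '{' after QMARK => SELBRACE (selector body)
#   $h = { 'a' => 1 }  # otherwise       => LBRACE (e.g. a hash literal)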
'}' => lambda do
@lexing_context[:brace_count] -= 1
emit(TOKEN_RBRACE, @scanner.pos)
end,
# TOKENS @, @@, @(
'@' => lambda do
scn = @scanner
la = scn.peek(2)
if la[1] == '@'
emit(TOKEN_ATAT, scn.pos) # TODO: Check if this is good for the grammar
elsif la[1] == '('
heredoc
else
emit(TOKEN_AT, scn.pos)
end
end,
# TOKENS |, |>, |>>
'|' => lambda do
scn = @scanner
la = scn.peek(3)
emit(la[1] == '>' ? (la[2] == '>' ? TOKEN_RRCOLLECT : TOKEN_RCOLLECT) : TOKEN_PIPE, scn.pos)
end,
# TOKENS =, =>, ==, =~
'=' => lambda do
scn = @scanner
la = scn.peek(2)
emit(case la[1]
when '='
TOKEN_ISEQUAL
when '>'
TOKEN_FARROW
when '~'
TOKEN_MATCH
else
TOKEN_EQUALS
end, scn.pos)
end,
# TOKENS '+', '+=', and '+>'
'+' => lambda do
scn = @scanner
la = scn.peek(2)
emit(case la[1]
when '='
TOKEN_APPENDS
when '>'
TOKEN_PARROW
else
TOKEN_PLUS
end, scn.pos)
end,
# TOKENS '-', '->', and epp '-%>' (end of interpolation with trim)
'-' => lambda do
scn = @scanner
la = scn.peek(3)
before = scn.pos
if @lexing_context[:epp_mode] && la[1] == '%' && la[2] == '>'
scn.pos += 3
if @lexing_context[:epp_mode] == :expr
enqueue_completed(TOKEN_EPPEND_TRIM, before)
end
interpolate_epp(:with_trim)
else
emit(case la[1]
when '>'
TOKEN_IN_EDGE
when '='
TOKEN_DELETES
else
TOKEN_MINUS
end, before)
end
end,
# TOKENS !, !=, !~
'!' => lambda do
scn = @scanner
la = scn.peek(2)
emit(case la[1]
when '='
TOKEN_NOTEQUAL
when '~'
TOKEN_NOMATCH
else
TOKEN_NOT
end, scn.pos)
end,
# TOKENS ~>, ~
'~' => lambda do
scn = @scanner
la = scn.peek(2)
emit(la[1] == '>' ? TOKEN_IN_EDGE_SUB : TOKEN_TILDE, scn.pos)
end,
'#' => lambda { @scanner.skip(PATTERN_COMMENT); nil },
# TOKENS '/', '/*' and '/ regexp /'
'/' => lambda do
scn = @scanner
la = scn.peek(2)
if la[1] == '*'
lex_error(Issues::UNCLOSED_MLCOMMENT) if scn.skip(PATTERN_MLCOMMENT).nil?
nil
else
before = scn.pos
# If in a position where a regexp is acceptable, scan for one; otherwise it is a division
value = scn.scan(PATTERN_REGEX) if regexp_acceptable?
if value
# Ensure an escaped / was not matched
while escaped_end(value)
more = scn.scan_until(PATTERN_REGEX_END)
return emit(TOKEN_DIV, before) unless more
value << more
end
regex = value.sub(PATTERN_REGEX_A, '').sub(PATTERN_REGEX_Z, '').gsub(PATTERN_REGEX_ESC, '/')
emit_completed([:REGEX, Regexp.new(regex), scn.pos-before], before)
else
emit(TOKEN_DIV, before)
end
end
end,
# TOKENS <, <=, <|, <<|, <<, <-, <~
'<' => lambda do
scn = @scanner
la = scn.peek(3)
emit(case la[1]
when '<'
if la[2] == '|'
TOKEN_LLCOLLECT
else
TOKEN_LSHIFT
end
when '='
TOKEN_LESSEQUAL
when '|'
TOKEN_LCOLLECT
when '-'
TOKEN_OUT_EDGE
when '~'
TOKEN_OUT_EDGE_SUB
else
TOKEN_LESSTHAN
end, scn.pos)
end,
# TOKENS >, >=, >>
'>' => lambda do
scn = @scanner
la = scn.peek(2)
emit(case la[1]
when '>'
TOKEN_RSHIFT
when '='
TOKEN_GREATEREQUAL
else
TOKEN_GREATERTHAN
end, scn.pos)
end,
# TOKENS :, ::CLASSREF, ::NAME
':' => lambda do
scn = @scanner
la = scn.peek(3)
before = scn.pos
if la[1] == ':'
# PERFORMANCE NOTE: This could potentially be sped up by using a case/when listing all
# upper case letters. Alternatively, the 'A', and 'Z' comparisons may be faster if they are
# frozen.
#
la2 = la[2]
if la2 >= 'A' && la2 <= 'Z'
# CLASSREF or error
value = scn.scan(PATTERN_CLASSREF)
if value && scn.peek(2) != '::'
after = scn.pos
emit_completed([:CLASSREF, value.freeze, after-before], before)
else
# move to faulty position ('::<uc-letter>' was ok)
scn.pos = scn.pos + 3
lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_CLASS_REFERENCE)
end
else
value = scn.scan(PATTERN_BARE_WORD)
if value
if value =~ PATTERN_NAME
emit_completed([:NAME, value.freeze, scn.pos - before], before)
else
emit_completed([:WORD, value.freeze, scn.pos - before], before)
end
else
# move to faulty position ('::' was ok)
scn.pos = scn.pos + 2
lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
end
end
else
emit(TOKEN_COLON, before)
end
end,
'$' => lambda do
scn = @scanner
before = scn.pos
value = scn.scan(PATTERN_DOLLAR_VAR)
if value
emit_completed([:VARIABLE, value[1..-1].freeze, scn.pos - before], before)
else
# consume the $ and let higher layer complain about the error instead of getting a syntax error
emit(TOKEN_VARIABLE_EMPTY, before)
end
end,
'"' => lambda do
# Recursive string interpolation; 'interpolate_dq' either returns a STRING token, or
# a DQPRE with the rest of the string's tokens placed in the @token_queue
interpolate_dq
end,
"'" => lambda do
scn = @scanner
before = scn.pos
emit_completed([:STRING, slurp_sqstring.freeze, scn.pos - before], before)
end,
"\n" => lambda do
# If a heredoc was lexed on this line, :newline_jump holds the position after its text
# lines so they are skipped over; otherwise just skip the newline.
#
ctx = @lexing_context
if ctx[:newline_jump]
@scanner.pos = ctx[:newline_jump]
ctx[:newline_jump] = nil
else
@scanner.pos += 1
end
ctx[:line_lexical_start] = @scanner.pos
nil
end,
'' => lambda { nil } # when peek(1) returns the empty string (end of input)
}
[ ' ', "\t", "\r" ].each { |c| @selector[c] = lambda { @scanner.skip(PATTERN_WS); nil } }
[ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'].each do |c|
@selector[c] = lambda do
scn = @scanner
before = scn.pos
value = scn.scan(PATTERN_NUMBER)
if value
length = scn.pos - before
assert_numeric(value, before)
emit_completed([:NUMBER, value.freeze, length], before)
else
invalid_number = scn.scan_until(PATTERN_NON_WS)
if before > 1
after = scn.pos
scn.pos = before - 1
if scn.peek(1) == '.'
# preceded by a dot. Is this a bad decimal number then?
scn.pos = before - 2
while scn.peek(1) =~ /^\d$/
invalid_number = nil
before = scn.pos
break if before == 0
scn.pos = scn.pos - 1
end
end
scn.pos = before
invalid_number = scn.peek(after - before) unless invalid_number
end
assert_numeric(invalid_number, before)
scn.pos = before + 1
lex_error(Issues::ILLEGAL_NUMBER, {:value => invalid_number})
end
end
end
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '_'].each do |c|
@selector[c] = lambda do
scn = @scanner
before = scn.pos
value = scn.scan(PATTERN_BARE_WORD)
if value && value =~ PATTERN_NAME
emit_completed(KEYWORDS[value] || @taskm_keywords[value] || [:NAME, value.freeze, scn.pos - before], before)
elsif value
emit_completed([:WORD, value.freeze, scn.pos - before], before)
else
# move to faulty position ([a-z_] was ok)
scn.pos = scn.pos + 1
fully_qualified = scn.match?(/::/)
if fully_qualified
lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
else
lex_error(Issues::ILLEGAL_NAME_OR_BARE_WORD)
end
end
end
end
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'].each do |c|
@selector[c] = lambda do
scn = @scanner
before = scn.pos
value = scn.scan(PATTERN_CLASSREF)
if value && scn.peek(2) != '::'
emit_completed([:CLASSREF, value.freeze, scn.pos - before], before)
else
# move to faulty position ([A-Z] was ok)
scn.pos = scn.pos + 1
lex_error(Issues::ILLEGAL_CLASS_REFERENCE)
end
end
end
@selector.default = lambda do
# Handles unicode spaces of various kinds that are captured by the whitespace regexp, but not by the
# simpler selector entries above (not worth handling those special cases with better performance).
scn = @scanner
if scn.skip(PATTERN_WS)
nil
else
# "unrecognized char"
emit([:OTHER, scn.peek(1), 1], scn.pos)
end
end
@selector.each { |k,v| k.freeze }
@selector.freeze
end
# Determines if the terminating '/' of value is escaped, i.e. preceded by an odd number of backslashes
def escaped_end(value)
escaped = false
if value.end_with?(STRING_BSLASH_SLASH)
value[1...-1].each_codepoint do |cp|
if cp == 0x5c # backslash
escaped = !escaped
else
escaped = false
end
end
end
escaped
end
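# Illustrative behavior (double-quoted Ruby literals; values are the scanned text):
#   escaped_end("/foo\\/")   # text /foo\/  => true, the final '/' is escaped
#   escaped_end("/foo\\\\/") # text /foo\\/ => false, the pair escapes the backslash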
# Clears the lexer state. It is not required to call this, as the state will be garbage collected
# and the next lex call (lex_string, lex_file) will reset the internal state.
#
def clear()
# not really needed, but if someone wants to ensure garbage is collected as early as possible
@scanner = nil
@locator = nil
@lexing_context = nil
end
# Convenience method, and for compatibility with the older lexer. Use lex_string instead, which allows
# passing the path to use without first having to call file= (which reads the file if it exists).
# (It is bad form to overload the assignment operator for something that is not really an assignment; an
# overloaded = also does not allow passing more than one argument.)
#
def string=(string)
lex_string(string, nil)
end
def lex_string(string, path=nil)
initvars
assert_not_bom(string)
@scanner = StringScanner.new(string)
@locator = Locator.locator(string, path)
end
# Lexes an unquoted string.
# @param string [String] the string to lex
# @param locator [Locator] the locator to use (a default is used if nil is given)
# @param escapes [Array<String>] array of character strings representing the escape sequences to transform
# @param interpolate [Boolean] whether interpolation of expressions should be made or not.
#
def lex_unquoted_string(string, locator, escapes, interpolate)
initvars
assert_not_bom(string)
@scanner = StringScanner.new(string)
@locator = locator || Locator.locator(string, '')
@lexing_context[:escapes] = escapes || UQ_ESCAPES
@lexing_context[:uq_slurp_pattern] = interpolate ? (@lexing_context[:escapes].include?('$') ? SLURP_UQ_PATTERN : SLURP_UQNE_PATTERN) : SLURP_ALL_PATTERN
end
# Convenience method, and for compatibility with the older lexer. Use lex_file instead.
# (It is bad form to overload the assignment operator for something that is not really an assignment.)
#
def file=(file)
lex_file(file)
end
# TODO: This method should not be used, callers should get the locator since it is most likely required to
# compute line, position etc given offsets.
#
def file
@locator ? @locator.file : nil
end
# Initializes lexing of the content of the given file. An empty string is used if the file does not exist.
#
def lex_file(file)
initvars
contents = Puppet::FileSystem.exist?(file) ? Puppet::FileSystem.read(file, :mode => 'rb', :encoding => 'utf-8') : ''
assert_not_bom(contents)
@scanner = StringScanner.new(contents.freeze)
@locator = Locator.locator(contents, file)
end
def initvars
@token_queue = []
# NOTE: additional keys are used; :escapes, :uq_slurp_pattern, :newline_jump, :epp_*
@lexing_context = {
:brace_count => 0,
:after => nil,
:line_lexical_start => 0
}
# Use of --tasks introduces the new keywords 'plan' and 'apply'
@taskm_keywords = Puppet[:tasks] ? { 'plan' => [:PLAN, 'plan', 4], 'apply' => [:APPLY, 'apply', 5] }.freeze : EMPTY_HASH
end
# Scans all of the content and returns it in an array
# Note that the terminating [false, false] token is included in the result.
#
def fullscan
result = []
scan {|token| result.push(token) }
result
end
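# Illustrative usage (a sketch; the source string and path are made up):
#   lexer = Puppet::Pops::Parser::Lexer2.new
#   lexer.lex_string('$a = 10', '/tmp/example.pp')
#   lexer.fullscan
#   # => [[:VARIABLE, ...], [:EQUALS, ...], [:NUMBER, ...], [false, false]]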
# A block must be passed to scan. It will be called with two arguments: a symbol for the token,
# and an instance of LexerSupport::TokenValue.
# PERFORMANCE NOTE: The TokenValue is designed to reduce the amount of garbage / temporary data,
# and to only convert the lexer's internal tokens on demand. It is slightly more costly to create an
# instance of a class defined in Ruby than an Array or Hash, but the gain is much bigger since transformation
# logic is avoided for many of its members (most are never used; e.g. line/pos information is in general only
# of value for error messages, and for some expressions which the lexer does not know about).
#
def scan
# PERFORMANCE note: it is faster to access local variables than instance variables.
# This makes a small but notable difference since instance member access is avoided for
# every token in the lexed content.
#
scn = @scanner
lex_error_without_pos(Issues::NO_INPUT_TO_LEXER) unless scn
ctx = @lexing_context
queue = @token_queue
selector = @selector
scn.skip(PATTERN_WS)
# This is the lexer's main loop
until queue.empty? && scn.eos? do
token = queue.shift || selector[scn.peek(1)].call
if token
ctx[:after] = token[0]
yield token
end
end
# Signals end of input
yield [false, false]
end
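# Since each produced token is a two-element array, a scan block can destructure it
# (a sketch, reusing the hypothetical lexer from the fullscan example above):
#   lexer.scan do |token_symbol, token_value|
#     # token_value is a LexerSupport::TokenValue (or false for the end marker)
#   end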
# This lexes one token at the current position of the scanner.
# PERFORMANCE NOTE: Any change to this logic should be performance measured.
#
def lex_token
@selector[@scanner.peek(1)].call
end
# Emits (produces) a token [:tokensymbol, TokenValue] and moves the scanner's position past the token
#
def emit(token, byte_offset)
@scanner.pos = byte_offset + token[2]
[token[0], TokenValue.new(token, byte_offset, @locator)]
end
# Emits the completed token of the form [:tokensymbol, TokenValue]. This method does not alter
# the scanner's position.
#
def emit_completed(token, byte_offset)
[token[0], TokenValue.new(token, byte_offset, @locator)]
end
# Enqueues a completed token at the given offset
def enqueue_completed(token, byte_offset)
@token_queue << emit_completed(token, byte_offset)
end
# Allows subprocessors for heredoc etc to enqueue tokens that are tokenized by a different lexer instance
#
def enqueue(emitted_token)
@token_queue << emitted_token
end
# Answers after which tokens it is acceptable to lex a regular expression.
# PERFORMANCE NOTE:
# It may be beneficial to turn this into a hash with default value of true for missing entries.
# A case expression with literal values will however create a hash internally. Since a reference is
# always needed to the hash, this access is almost as costly as a method call.
#
def regexp_acceptable?
case @lexing_context[:after]
# Ends of (potential) R-value generating expressions
when :RPAREN, :RBRACK, :RRCOLLECT, :RCOLLECT
false
# End of (potential) R-value - but must be allowed because of case expressions
# Called out here to not be mistaken for a bug.
when :RBRACE
true
# Operands that can be followed by DIV (even if illegal in the grammar)
when :NAME, :CLASSREF, :NUMBER, :STRING, :BOOLEAN, :DQPRE, :DQMID, :DQPOST, :HEREDOC, :REGEX, :VARIABLE, :WORD
false
else
true
end
end
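# Illustratively (a sketch):
#   $a = /devices/   # after EQUALS   => true; '/' starts a regexp
#   $a / 2           # after VARIABLE => false; '/' lexes as TOKEN_DIV (division)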
end
end
end