Flex (lexer) - matching unicode

Question

Is there a way to get flex to match unicode along the lines of

ascSymbol     !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol     \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol        ascSymbol|uniSymbol{-}[^|_"',;]

I found http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html via Flex(lexer) support for unicode, but I'd want to be able to do something like this in an automated way.

For example, I'm using cmake and it is configured to generate the lexer/parser on build from the *.l and *.y files. I'd ideally want a work around that didn't require GHC or another Haskell compiler to be installed.

Also open to suggestions for another lexer that integrates with Bison and has unicode support....

I don't think there is a way to do this other than to compile a (long) regular expression from the list of desired UTF-8 codes. Doing that by hand would be a nuisance, but it wouldn't be *too* hard to write in Python, for example. But the result would be tied to the Unicode database at the time of scanner generation, so you'd need to regenerate the scanner every time the UCD changed. — rici, Mar 08 '15 at 18:53
hmm, this is starting to look like a painful task. Don't fancy generating a massive file for flex from everything in http://www.unicode.org/Public/UCD/latest/ucd/ wouldn't know until I've tried but it sounds terribly inefficient. Looking at whether I can get what I want done with [Ragel](http://www.colm.net/files/ragel/ragel-guide-6.9.pdf) — zcourts, Mar 09 '15 at 22:22

zcourts · Accepted Answer · 2015-03-26T13:00:33.453

As it turns out getting unicode support in Flex would be a pain unless the Flex source itself adds it. There seems to be some experimental stuff in there for unicode but never made it into a release that I can find.

The Ragel doc is insightful, and comes with built in support for Unicode. I've since found this article which gives an example of how to make Ragel and C++ play nice. Seems to be the better option so going with that.

Hopefully this saves someone else the time it took to figure this out.

EDIT

"Built in support" as stated above is perhaps an exaggeration. It has been easier to get unicode support, but it's not an out-of-the-box kind of thing. Using cmake I'm generating a state machine from the derived UCD 7 file. In CMakeLists.txt I do:

# Ruby is required to generate the Unicode Ragel machine.
find_package(Ruby REQUIRED)
message(STATUS "Found Ruby ${RUBY_VERSION}")
set(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")

# (Re)generate the machine when it is missing, or when gen_unicode is set to a
# true value (e.g. -Dgen_unicode=ON on the cmake command line).
if(NOT EXISTS "${UNICODE_MACHINE_PATH}" OR gen_unicode)
  message(STATUS "Attempting to generate unicode state machine")
  # The script writes the Ragel source to stdout; capture it into the file.
  execute_process(
    COMMAND "${RUBY_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/unicode2ragel.rb"
    OUTPUT_FILE "${UNICODE_MACHINE_PATH}"
    RESULT_VARIABLE RAGEL_UNICODE_GEN_RES
  )

  # RESULT_VARIABLE holds either the exit code or an error string, so compare
  # it as a quoted string instead of expanding it unquoted into if().
  if("${RAGEL_UNICODE_GEN_RES}" STREQUAL "0")
    message(STATUS "Generated Ragel Unicode state machine")
  else()
    # OUTPUT_FILE leaves a partial/empty file behind on failure; remove it so
    # the EXISTS check above retries generation on the next configure.
    file(REMOVE "${UNICODE_MACHINE_PATH}")
    message(SEND_ERROR
      "Unable to generate unicode state machine (result: ${RAGEL_UNICODE_GEN_RES})")
  endif()
endif()

Then in unicode2ragel.rb (ships with Ragel and modified slightly for UCD 7)

#!/usr/bin/env ruby
#
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode characters by general category.  It generates
# one character class per category, e.g. uUppercaseLetter,
# uLowercaseLetter and uDecimalNumber.
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
#    -e, --encoding [ucs4 | utf8]     Data encoding
#    -h, --help                       Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>

require 'optparse'
require 'open-uri'

# Encodings accepted by --encoding, and the Ragel alphtype that the generated
# file header tells the user to select for each of them.
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
# UCD 7.0.0 table downloaded by each_alpha and filtered by general category.
# Alternative table (not used; presumably what an earlier revision read):
#   http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt
CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"

###
# Display vars & default option
#
# TOTAL_WIDTH and RANGE_WIDTH are read by generate_machine to lay out each
# output line: RANGE_WIDTH is the minimum width of the code column and
# TOTAL_WIDTH feeds the computation of how much room the description gets.
# @encoding is the default encoding; the -e/--encoding option overrides it.

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8

###
# Option parsing

# Command-line handling.  -e selects the output encoding, -h prints usage.
cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare `-e` yields nil here;
    # keep the default encoding in that case instead of crashing on nil.
    @encoding = o.downcase.to_sym unless o.nil?
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  # stdout carries the generated Ragel source (callers redirect it to a file),
  # so diagnostics go to stderr and the exit status signals the failure.
  warn "Invalid encoding: #{@encoding}"
  warn cli_opts.to_s
  exit 1
end

##
# Downloads the document at url and yields every alpha line's hex
# range and description.

# Reads the UCD table at +url+ and yields, for every data line whose
# property column equals +property+ (e.g. "Lu"), the code point range as an
# Integer Range together with the trailing description text of that line.
#
# url      - location of the table; an http(s) URL or a local path.
# property - property code as it appears between "; " and " #" in the table.
#            It is interpolated into a regular expression as-is.
def each_alpha( url, property )
  wanted = /; #{property} #/

  # Kernel#open stopped opening URLs in Ruby 3.0; URI.open (Ruby >= 2.5) is
  # the supported entry point.  Fall back to Kernel#open on older Rubies.
  opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)

  opener.call( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/        # comment line
      next if line !~ wanted      # some other property

      # "0041..005A    ; Lu #  [26] LATIN CAPITAL LETTER A..."
      # Split on the first ';' only so the remainder stays in one piece.
      range, description = line.split(';', 2)
      range = range.strip
      description = description.gsub(/.*#/, '').strip

      # A line covers either a single code point or a "first..last" span.
      if range =~ /\.\./
        start, stop = range.split '..'
      else
        start = stop = range
      end

      yield(start.hex .. stop.hex, description)
    end
  end
end

###
# Formats to hex at minimum width

# Renders n as upper-case hex with an even number of digits (whole bytes),
# prefixing a single "0" when the natural width is odd.
def to_hex( n )
  digits = format("%0X", n)
  digits.length.odd? ? "0#{digits}" : digits
end

###
# UCS4 is just a straight hex conversion of the unicode codepoint.

# Formats a code point range for a UCS-4 machine: "0xAAAA" for a single
# code point, "0xAAAA..0xBBBB" otherwise.  Returns a one-element array so
# the result has the same shape as to_utf8's.
def to_ucs4( range )
  first = range.begin
  last  = range.end

  text = "0x#{to_hex(first)}"
  text += "..0x#{to_hex(last)}" unless first == last
  [ text ]
end

##
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

# Encodes code point n as UTF-8 and returns the bytes packed into one hex
# string (e.g. 0x20AC -> "E282AC").  Values above 0x10ffff encode as "00".
def to_utf8_enc( n )
  bytes =
    if n <= 0x7f
      [ n ]
    elsif n <= 0x7ff
      [ 0xc0 | (n >> 6),
        0x80 | (n & 0x3f) ]
    elsif n <= 0xffff
      [ 0xe0 | (n >> 12),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    elsif n <= 0x10ffff
      [ 0xf0 | (n >> 18),
        0x80 | ((n >> 12) & 0x3f),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    else
      [ 0 ]
    end

  # Pack the bytes big-endian into a single integer before formatting.
  packed = bytes.inject(0) { |acc, byte| (acc << 8) | byte }
  to_hex(packed)
end

##
# Inverse of to_utf8_enc: takes a hex string holding a packed UTF-8 byte
# sequence (e.g. "E282AC") and returns the code point it encodes.
#
# The sequence length is inferred from the magnitude of the packed value.
# A lone non-ASCII byte such as "BF" falls into the two-byte branch with an
# empty lead byte and decodes to its low six bits; count_codepoints calls
# this function on such per-byte bounds.
def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    # 110yyyyy 10zzzzzz
    y = (n >> 8) & 0x1f
    z =  n       & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    # 1110xxxx 10yyyyyy 10zzzzzz -- x sits above two 6-bit groups, so it
    # is shifted by 12 (not 10).
    x = (n >> 16) & 0x0f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    # 11110www 10xxxxxx 10yyyyyy 10zzzzzz
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end

###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries.  This is included
# for completeness as there is no telling if that will ever change.

# Splits +range+ (a Range of code points) at the UTF-8 length boundaries in
# UTF8_BOUNDARIES, so that every returned sub-range encodes to a fixed
# number of bytes.  Returns an array of Ranges in ascending order; the part
# of +range+ above the last boundary, if any, is dropped.
def utf8_ranges( range )
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    next if range.begin > max
    return ranges << range if range.end <= max

    # Parenthesised on purpose: `<<` binds tighter than `..`, so without the
    # parentheses this pushes range.begin and then builds an invalid Range.
    ranges << (range.begin .. max)
    range = (max + 1) .. range.end
  end
  ranges
end

# Expands the span between two packed byte sequences into per-byte patterns.
#
# start, stop - hex strings, two digits per byte (to_utf8 passes the output
#               of to_utf8_enc).  Assumed to have the same length; only
#               start's length is inspected.
#
# Returns an array of strings.  Each string is a sequence of "0xNN " (fixed
# byte) and "0xAA..0xBB " (byte range) terms, one term per byte position.
# Recursion consumes one byte (two hex digits) per level.
#
# NOTE(review): the filler bytes used when splitting are 00 and FF, not the
# 80/BF bounds of UTF-8 continuation bytes -- confirm this is intended.
def build_range( start, stop )
  size = start.size/2   # number of bytes still to process
  left = size - 1       # bytes remaining after the current one
  return [""] if size < 1

  # Leading byte of each bound, as two hex digits.
  a = start[0..1]
  b = stop[0..1]

  ###
  # Shared prefix

  if a == b
    # Same leading byte: emit it as a fixed term and recurse on the rest.
    return build_range(start[2..-1], stop[2..-1]).map do |elt|
      "0x#{a} " + elt
    end
  end

  ###
  # Unshared prefix, end of run

  return ["0x#{a}..0x#{b} "] if left.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A

  ret = []
  # First piece: from start up to the last sequence sharing start's lead byte.
  ret << build_range(start, a + "FF" * left)

  ###
  # Only generate middle range if need be.

  if a.hex+1 != b.hex
    max = to_hex(b.hex - 1)
    max = "FF" if b == "FF"
    # Middle piece: every lead byte strictly between a and b (through FF
    # itself when b is FF), with all trailing bytes unconstrained.
    ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
  end

  ###
  # Don't generate last range if it is covered by first range

  ret << build_range(b + "00" * left, stop) unless b == "FF"
  # The pieces pushed above are arrays themselves; return one flat list.
  ret.flatten!
end

# Converts a code point Range into the list of UTF-8 byte patterns that
# match it: the range is first split at UTF-8 length boundaries, then each
# piece is expanded by build_range.  Always returns an array (empty when
# utf8_ranges yields no pieces).
def to_utf8( range )
  pieces = utf8_ranges( range ).map do |r|
    build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
  end
  # Non-destructive flatten: flatten! returns nil when nothing was nested,
  # which would hand callers nil instead of an empty list.
  pieces.flatten
end

##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.

# Counts how many sequences one generated pattern line matches: the product
# of the widths of all "0xAA..0xBB" terms in +code+.  Terms without a range
# contribute a factor of 1.  In utf8 mode the bounds are decoded with
# from_utf8_enc before subtracting; otherwise they are read as plain hex.
def count_codepoints( code )
  widths = code.split(' ').map do |term|
    bounds = /0x(.+)\.\.0x(.+)/.match(term)
    if bounds.nil?
      1
    elsif @encoding == :utf8
      from_utf8_enc(bounds[2]) - from_utf8_enc(bounds[1]) + 1
    else
      bounds[2].hex - bounds[1].hex + 1
    end
  end
  widths.inject(1) { |product, width| product * width }
end

# Cross-checks three counts for one table entry: the "[N]" count quoted in
# the description (1 when absent), the size of the parsed range, and the
# total number of sequences matched by the generated +codes+.  True only
# when all three agree.
def is_valid?( range, desc, codes )
  advertised  = desc[/\[(\d+)\]/, 1]
  spec_count  = advertised.nil? ? 1 : advertised.to_i
  range_count = range.end - range.begin + 1

  sum = 0
  codes.each { |elt| sum += count_codepoints(elt) }

  sum == spec_count && sum == range_count
end

##
# Generate the state machine to stdout

# Prints one Ragel machine definition called +name+ to stdout.  It is the
# union of the patterns for every CHART_URL entry whose property code is
# +property+.  Each alternative goes on its own line, followed by a '#'
# comment carrying the entry's description (first line of each entry only).
# Raises if the generated patterns fail the is_valid? count check.
def generate_machine( name, property )
  separator = " "    # the first alternative has no leading '|'
  puts "    #{name} = "

  each_alpha( CHART_URL, property ) do |range, desc|
    codes = if @encoding == :ucs4
              to_ucs4(range)
            else
              to_utf8(range)
            end

    unless is_valid? range, desc, codes
      raise "Invalid encoding of range #{range}: #{codes.inspect}"
    end

    # The code column is at least RANGE_WIDTH wide; anything wider eats
    # into the space left for the description.
    widest      = codes.map { |c| c.size }.max
    range_width = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    desc_width  = TOTAL_WIDTH - RANGE_WIDTH - 11
    desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH

    desc = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |pattern, idx|
      label  = idx.zero? ? desc : ""
      padded = pattern.ljust(range_width)
      puts "      #{separator} #{padded} ##{label}"
      separator = "|"
    end
  end

  puts "      ;"
  puts ""
end

# Emit the generated Ragel file on stdout: a comment header, then one
# machine per Unicode general category inside a single %%{ ... }%% section.
puts <<EOF
# The following Ragel file was autogenerated from: #{CHART_URL}
#
# It defines one machine per Unicode general category, for example
# uUppercaseLetter, uDecimalNumber and uMathSymbol.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
    machine WChar;
EOF

# [machine name, general category code], in the order they are emitted.
categories = [
  [ :uUppercaseLetter,      "Lu" ],
  [ :uLowercaseLetter,      "Ll" ],
  [ :uTitlecaseLetter,      "Lt" ],
  [ :uModifierLetter,       "Lm" ],
  [ :uOtherLetter,          "Lo" ],
  [ :uNonspacingMark,       "Mn" ],
  [ :uEnclosingMark,        "Me" ],
  [ :uSpacingMark,          "Mc" ],
  [ :uDecimalNumber,        "Nd" ],
  [ :uLetterNumber,         "Nl" ],
  [ :uOtherNumber,          "No" ],
  [ :uSpaceSeparator,       "Zs" ],
  [ :uLineSeparator,        "Zl" ],
  [ :uParagraphSeparator,   "Zp" ],
  [ :uFormat,               "Cf" ],
  [ :uPrivateUse,           "Co" ],
  [ :uSurrogate,            "Cs" ],
  [ :uDashPunctuation,      "Pd" ],
  [ :uOpenPunctuation,      "Ps" ],
  [ :uClosePunctuation,     "Pe" ],
  [ :uConnectorPunctuation, "Pc" ],
  [ :uOtherPunctuation,     "Po" ],
  [ :uMathSymbol,           "Sm" ],
  [ :uCurrencySymbol,       "Sc" ],
  [ :uModifierSymbol,       "Sk" ],
  [ :uOtherSymbol,          "So" ],
  [ :uInitialPunctuation,   "Pi" ],
  [ :uFinalPunctuation,     "Pf" ],
]
categories.each { |name, property| generate_machine( name, property ) }

puts <<EOF
}%%
EOF

Then in your ragel machine files you can include unicode.rl and get access to each of the unicode groups defined e.g uUppercaseLetter and so on...

score 0 · Answer 2 · answered Mar 24 '17 at 13:12

"Also open to suggestions for another lexer that integrates with Bison and has unicode support...."

The RE/flex project offers a Flex-compatible lexical analyzer generator for C++ that supports Unicode and works with Bison.

It accepts your example (a bit modified, to fix syntax):

%option unicode
ascSymbol     [!#$%&⋆+./<=>?@\\^\-~:]
uniSymbol     [\p{Symbol}\p{Other_Symbol}\p{Punctuation}]{-}[\^|_"',;]
symbol        {ascSymbol}|{uniSymbol}

Flex (lexer) - matching unicode

2 Answers2