Arc Forumnew | comments | leaders | submitlogin
Rob Pike's elegant regex matcher
4 points by akkartik 5073 days ago | discuss
http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html

http://news.ycombinator.com/item?id=3117590

Here's a direct wart transliteration:

  def next(s)
    (s 1 len.s)

  def match(regex text)
    if (and !blank?.regex (iso regex.0 "^"))
      (matchhere next.regex text)
      (or (matchhere regex text)
          (if !blank?.text
            (match regex next.text)))

  def matchhere(regex text)
    (if blank?.regex
        1
      (and (iso regex.0 "$") !next.regex)
        !text
      blank?.text
        nil
      (iso regex.1 "*")
        (matchstar regex.0 (next next.regex) text)
      (or (iso regex.0 ".") (iso regex.0 text.0))
        (matchhere next.regex next.text))

  def matchstar(c regex text)
    or (matchhere regex text)
       (if (and !blank?.text (or (iso c text.0) (iso c ".")))
         (matchstar c regex next.text))
It's slow because we're doing a string-copy on every call to next. Here's a version that deals in indices to avoid that:

  def match(regex text)
    with (maxr len.regex  maxt len.text
          startmatch nil  matchhere nil  matchstar nil)
      def! startmatch(r t)
        (if (and (~iso r maxr) (iso "^" regex.r))
          (matchhere (+ r 1) t)
          (or (matchhere r t)
              (if (~iso t maxt)
                (startmatch r (+ t 1)))))

      def! matchhere(r t)
        (if (iso r maxr)
              1
            (and (iso r (- maxr 1)) (iso "$" regex.r))
              (iso t maxt)
            (iso t maxt)
              nil
            (iso "*" (regex (+ r 1)))
              (matchstar regex.r (+ r 2) t)
            (or (iso "." regex.r) (iso regex.r text.t))
              (matchhere (+ r 1) (+ t 1)))

      def! matchstar(c r t)
        (or (matchhere r t)
            (if (and (~iso t maxt) (or (iso c text.t) (iso c ".")))
              (matchstar c r (+ t 1))))

      (startmatch 0 0)
Instead of operating on indices we could simply coerce the strings to lists at the start to make next cheap (ie just cdr):

  def match(regex text)
    coerce! regex list
    coerce! text list
    if (and regex (iso car.regex "^"))
      (matchhere cdr.regex text)
      (or (matchhere regex text)
          (if !blank?.text
            (match regex cdr.text)))

  def matchhere(regex text)
    (if blank?.regex
        1
      (and (iso regex.0 "$") !cdr.regex)
        !text
      blank?.text
        nil
      (iso regex.1 "*")
        (matchstar regex.0 cddr.regex text)
      (or (iso regex.0 ".") (iso regex.0 text.0))
        (matchhere cdr.regex cdr.text))

  def matchstar(c regex text)
    or (matchhere regex text)
       (if (and !blank?.text (or (iso c text.0) (iso c ".")))
         (matchstar c regex cdr.text))
Finally, some unit tests:

  (test "empty - 1"
   :valueof (match "" "")
   :should ~be nil)

  (test "empty - 2"
   :valueof (match "" "a")
   :should ~be nil)

  (test "empty - 3"
   :valueof (match "a" "")
   :should be nil)

  (test "literal - 1"
   :valueof (match "abc" "abc")
   :should ~be nil)

  (test "literal - 2"
   :valueof (match "abc" "def")
   :should be nil)

  (test "literal - 3"
   :valueof (match "abc" "ab")
   :should be nil)

  (test "literal - 4"
   :valueof (match "ab" "abc")
   :should ~be nil)

  (test "dot - 1"
   :valueof (match "a." "abc")
   :should ~be nil)

  (test "dot - 2"
   :valueof (match "a.c" "abc")
   :should ~be nil)

  (test "dot - 3"
   :valueof (match ".c" "abc")
   :should ~be nil)

  (test "dot - 4"
   :valueof (match ".a" "abc")
   :should be nil)

  (test "star - 1"
   :valueof (match "a*" "abc")
   :should ~be nil)

  (test "star - 2"
   :valueof (match "a*b" "abc")
   :should ~be nil)

  (test "star - 3"
   :valueof (match "a*c" "abc")
   :should ~be nil)

  (test "star - 4"
   :valueof (match "a*d" "abc")
   :should be nil)

  (test "dot star - 1"
   :valueof (match "a.*b" "abc")
   :should ~be nil)

  (test "dot star - 2"
   :valueof (match "a.*c" "abc")
   :should ~be nil)

  (test "dot star - 3"
   :valueof (match ".*c" "abc")
   :should ~be nil)

  (test "caret - 1"
   :valueof (match "^a" "abc")
   :should ~be nil)

  (test "caret - 2"
   :valueof (match "^a*c" "abc")
   :should be nil)

  (test "dollar - 1"
   :valueof (match "c$" "abc")
   :should ~be nil)

  (test "dollar - 2"
   :valueof (match "b$" "abc")
   :should be nil)