----------------------------------------------------------------------------- -- A representation of "compositional" graphs for Thompson's -- construction of NFAs from regular expressions. -- This version corresponds to the description in the paper. ----------------------------------------------------------------------------- import List import Assertion -- for testing -- A graph is a list of nodes and edges: data Graph = Graph [Node] [Edge] -- We use natural numbers for node identifiers: type NodeId = Int -- A node is identified by a node identifier: data Node = Node NodeId -- An edge consists of a source and a target node which are -- identified by their node identifiers and a marking: data Edge = Edge NodeId (Maybe Char) NodeId -- Add two (disjoint) graphs: joinGraphs :: Graph -> Graph -> Graph joinGraphs (Graph ns1 es1) (Graph ns2 es2) = Graph (ns1++ns2) (es1++es2) -- Add a specified number of new nodes to a graph and return the graph -- together with the list of new nodeids: addNodes :: Int -> Graph -> (Graph,[NodeId]) addNodes n (Graph ns es) = (Graph (map Node newnodes ++ ns) es, newnodes) where newvars i = if i<=0 then [] else x : newvars (i-1) where x free newnodes = newvars n -- Add a list of new edges to a graph: addEdges :: [Edge] -> Graph -> Graph addEdges edges (Graph ns es) = Graph ns (es++edges) -- Compute the outgoing edges (i.e., target nodes) in a graph -- starting from a node with a particular marking: outgoingEdges :: Graph -> NodeId -> Maybe Char -> [NodeId] outgoingEdges (Graph _ edges) node mark = outgoing edges where outgoing [] = [] outgoing (Edge n1 m n2 : es) = if n1==node && m==mark then n2 : outgoing es else outgoing es -- A graph is finalized for further processing if all nodes -- have uniquely assigned node identifiers: isFinalizedGraph :: Graph -> Success isFinalizedGraph (Graph nodes _) = numberNodes 1 nodes where numberNodes _ [] = success numberNodes n (Node ni : ns) | ni =:= n -- assign unique identifier = numberNodes (n+1) ns ------------------------------------------------------------------------- -- Representation of regular expressions: data RegExp a = Empty -- empty word | Symbol a -- a single symbol | Alt (RegExp a) (RegExp a) -- alternative | Conc (RegExp a) (RegExp a) -- concatenation | Star (RegExp a) -- repetition -- Translation of a regular expression into non-deterministic automata -- where the transition relation is represented as a graph -- (Thompson's construction): regExp2graph :: RegExp Char -> (NodeId,Graph,NodeId) -- the empty word is translated into a graph with a single epsilon edge: regExp2graph Empty = (s, Graph [Node s, Node e] [Edge s Nothing e], e) where s,e free -- an atomic symbol is translated into a graph with a single edge marked -- with this symbol: regExp2graph (Symbol c) = (s, Graph [Node s, Node e] [Edge s (Just c) e], e) where s,e free -- the alternative is translated by combining the graphs of the -- alternatives with epsilon transitions: regExp2graph (Alt re1 re2) = (s, addEdges [Edge s Nothing s1, Edge s Nothing s2, Edge e1 Nothing e, Edge e2 Nothing e] g12, e) where (s1,g1,e1) = regExp2graph re1 (s2,g2,e2) = regExp2graph re2 (g12,[s,e]) = addNodes 2 (joinGraphs g1 g2) -- the concatentation is translated by combining the graphs of the -- components sequentially (with an epsilon transition): regExp2graph (Conc re1 re2) = (s1, addEdges [Edge e1 Nothing s2] (joinGraphs g1 g2), e2) where (s1,g1,e1) = regExp2graph re1 (s2,g2,e2) = regExp2graph re2 -- the star operator is translated by putting some epsilons transitions -- around the component graph: regExp2graph (Star re1) = (s, addEdges [Edge s Nothing e, Edge e Nothing s1, Edge e1 Nothing e] g1s, e) where (s1,g1,e1) = regExp2graph re1 (g1s,[s,e]) = addNodes 2 g1 ------------------------------------------------------------------------- -- Solving the word problem for languages described by regular expressions. -- -- For simplicity, we use the NFA approach where we compute epsilon closures -- of NFA states on the fly. -- Compute the epsilon closure for a list of nodes: e_closure :: Graph -> [NodeId] -> [NodeId] e_closure graph init_nodes = closure init_nodes init_nodes where closure nodes [] = nodes closure nodes (n:ns) = let newnodes = filter (`notElem` nodes) (outgoingEdges graph n Nothing) in closure (newnodes++nodes) (newnodes++ns) -- Check whether a string is member of the language described by the -- regular expression: check :: RegExp Char -> String -> Bool check re cs | isFinalizedGraph g -- make sure to instantiate all nodeids = scan (e_closure g [s]) g e cs where (s,g,e) = regExp2graph re scan state _ final [] = final `elem` state scan state graph final (c:cs) = scan (e_closure graph newnodes) graph final cs where newnodes = foldr union [] (map (\n->outgoingEdges graph n (Just c)) state) ------------------------------------------------------------------------- -- ...and now some simple tests: -- Here are some examples for regular expressions: -- a digit digit_re = foldr1 Alt (map (\i->Symbol (chr (ord '0' + i))) [0..9]) -- a natural number num_re = Conc (digit_re) (Star digit_re) -- a*b* asbs = Conc (Star (Symbol 'a')) (Star (Symbol 'b')) -- (a(cd)*b)* acdsbs = Star (Conc (Symbol 'a') (Conc (Star (Conc (Symbol 'c') (Symbol 'd'))) (Symbol 'b'))) -- ...and now we check them: -- (execute shell command "currytest thompson") main1 = AssertEqual "num" (check num_re "135" ) True main2 = AssertEqual "asbs1" (check asbs "aabbb" ) True main3 = AssertEqual "asbs2" (check asbs "aaabbab" ) False main4 = AssertEqual "asbs3" (check (Conc asbs asbs) "baa") True main5 = AssertEqual "acd1" (check acdsbs "abcd" ) False main6 = AssertEqual "acd2" (check acdsbs "abacdbabab" ) True main7 = AssertEqual "empty" (check (Star Empty) "" ) True