apparmor/parser/libapparmor_re/expr-tree.h
John Johansen 846cee5066 Split out parsing and expression trees from regexp.y
Start of splitting regexp.y into logical components instead of the mess
it is today.  Split out the expr-tree and parsing components from regexp.y
int expr-tree.x and parse.y and since regexp.y no longer does parsing
rename it to hfa.cc

Some code cleanups snuck their way into this patch and since I am to
lazy to redo it, I have left them in.

Signed-off-by: John Johansen <john.johansen@canonical.com>
Acked-By: Steve Beattie <sbeattie@ubuntu.com>
2011-03-13 05:46:29 -07:00

627 lines
14 KiB
C++

/*
* (C) 2006, 2007 Andreas Gruenbacher <agruen@suse.de>
* Copyright (c) 2003-2008 Novell, Inc. (All rights reserved)
* Copyright 2009-2010 Canonical Ltd.
*
* The libapparmor library is licensed under the terms of the GNU
* Lesser General Public License, version 2.1. Please see the file
* COPYING.LGPL.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* Functions to create/manipulate an expression tree for regular expressions
* that have been parsed.
*
* The expression tree can be used directly after the parse creates it, or
* it can be factored so that the set of important nodes is smaller.
* Having a reduced set of important nodes generally results in a dfa that
* is closer to minimum (fewer redundant states are created). It also
* results in fewer important nodes in the state set during subset
* construction resulting in less memory used to create a dfa.
*
* Generally it is worth doing expression tree simplification before dfa
* construction, if the regular expression tree contains any alternations.
* Even if the regular expression doesn't contain any, simplification should
* be fast enough that it can be used with minimal overhead.
*/
#ifndef __LIBAA_RE_EXPR_H
#define __LIBAA_RE_EXPR_H
#include <map>
#include <set>
#include <stack>
#include <ostream>
#include "apparmor_re.h"
using namespace std;
typedef unsigned char uchar;
typedef set<uchar> Chars;
ostream& operator<<(ostream& os, uchar c);
/**
 * Set union.
 *
 * Returns a new set holding every element that appears in a, in b, or in
 * both. Neither operand is modified.
 */
template<class T>
set<T> operator+(const set<T>& a, const set<T>& b)
{
	set<T> merged = a;
	for (typename set<T>::const_iterator it = b.begin(); it != b.end(); ++it)
		merged.insert(*it);
	return merged;
}
/**
* When creating DFAs from regex trees, a DFA state is constructed from
* a set of important nodes in the syntax tree. This includes AcceptNodes,
* which indicate that when a match ends in a particular state, the
* regular expressions that the AcceptNode belongs to match.
*/
class Node;
class ImportantNode;
typedef set <ImportantNode *> NodeSet;
/**
* Text-dump a state (for debugging).
*/
ostream& operator<<(ostream& os, const NodeSet& state);
/**
 * Out-edges from one state to the next, keyed by input character.
 *
 * cases:     the follow-set of Nodes for each input character that has an
 *            explicit transition (i.e., following a CharNode or
 *            CharSetNode).
 * otherwise: the follow-set for the default match (i.e., following an
 *            AnyCharNode or NotCharSetNode); 0 while there is none.
 *
 * Default matches are stored in otherwise as well as in every explicit
 * case they also match. This avoids enumerating all the explicit
 * transitions for default matches.
 */
struct NodeCases {
	typedef map<uchar, NodeSet *>::iterator iterator;

	NodeCases() : otherwise(0) { }

	/* Iterate over the explicit cases only; otherwise is not included. */
	iterator begin() { return cases.begin(); }
	iterator end() { return cases.end(); }

	map<uchar, NodeSet *> cases;
	NodeSet *otherwise;
};
ostream& operator<<(ostream& os, Node& node);
/**
 * An abstract node in the syntax tree.
 *
 * A node holds up to two children (child[0] is left, child[1] is right)
 * and hands them to release() from its destructor. nullable, firstpos,
 * lastpos and followpos are the attributes of the same name from the
 * "Dragon Book".
 */
class Node {
public:
	/* Every constructor zeroes label so that reading it before it has
	 * been assigned a real value is well defined.
	 */
	Node() :
		nullable(false), label(0) { child[0] = child[1] = 0; }
	Node(Node *left) :
		nullable(false), label(0) { child[0] = left; child[1] = 0; }
	Node(Node *left, Node *right) :
		nullable(false), label(0) { child[0] = left; child[1] = right; }

	/* Release (not delete) the children, so that shared nodes can opt
	 * out of destruction by overriding release().
	 */
	virtual ~Node()
	{
		if (child[0])
			child[0]->release();
		if (child[1])
			child[1]->release();
	}

	/**
	 * See the "Dragon Book" for an explanation of nullable, firstpos,
	 * lastpos, and followpos.
	 */
	virtual void compute_nullable() { }
	virtual void compute_firstpos() = 0;
	virtual void compute_lastpos() = 0;
	virtual void compute_followpos() { }

	/* Structural equality with another node: non-zero when equal. */
	virtual int eq(Node *other) = 0;
	/* Write a textual form of this node to os and return os. */
	virtual ostream& dump(ostream& os) = 0;
	void dump_syntax_tree(ostream& os);

	bool nullable;
	NodeSet firstpos, lastpos, followpos;
	/* child 0 is left, child 1 is right */
	Node *child[2];
	unsigned int label;	/* unique number for debug etc */

	/**
	 * We indirectly release Nodes through a virtual function because
	 * accept and Eps Nodes are shared, and must be treated specially.
	 * We could use full reference counting here but the indirect release
	 * is sufficient and has less overhead
	 */
	virtual void release(void) { delete this; }
};
/* Base class for nodes that have children (zero, one or two of them). */
class InnerNode : public Node {
public:
	InnerNode() : Node() { }
	InnerNode(Node *left) : Node(left) { }
	InnerNode(Node *left, Node *right) : Node(left, right) { }
};
/* Base class for inner nodes constructed with a single (left) child. */
class OneChildNode : public InnerNode {
public:
	OneChildNode(Node *left) : InnerNode(left) { }
};
/* Base class for inner nodes constructed with a left and a right child. */
class TwoChildNode : public InnerNode {
public:
	TwoChildNode(Node *left, Node *right) : InnerNode(left, right) { }
};
/* Base class for nodes without children. */
class LeafNode : public Node {
public:
	LeafNode() : Node() { }
};
/* Match the empty string (//): a nullable leaf with label 0. */
class EpsNode : public LeafNode {
public:
	EpsNode() : LeafNode()
	{
		nullable = true;
		label = 0;
	}

	/* Deliberately a no-op: there is a single static instance shared by
	 * all trees, so it must never be deleted. Look for epsnode in the
	 * code.
	 */
	void release(void) { }

	/* firstpos and lastpos are left empty. */
	void compute_firstpos() { }
	void compute_lastpos() { }

	/* Any EpsNode equals any other EpsNode. */
	int eq(Node *other)
	{
		return dynamic_cast<EpsNode *>(other) ? 1 : 0;
	}

	ostream& dump(ostream& os)
	{
		os << "[]";
		return os;
	}
};
/**
 * Leaf nodes in the syntax tree are important to us: they describe the
 * characters that the regular expression matches. We also consider
 * AcceptNodes important: they indicate when a regular expression matches.
 */
class ImportantNode : public LeafNode {
public:
	ImportantNode() : LeafNode() { }

	/* An important node is its own sole first and last position. */
	void compute_firstpos() { firstpos.insert(this); }
	void compute_lastpos() { lastpos.insert(this); }

	/* Add this node's outgoing transitions to cases. */
	virtual void follow(NodeCases& cases) = 0;
};
/* Common base class for all the node classes that carry character
 * information.
 */
class CNode : public ImportantNode {
public:
	CNode() : ImportantNode() { }
};
/* Match one specific character (/c/). */
class CharNode : public CNode {
public:
CharNode(uchar c) : c(c) { }
void follow(NodeCases& cases)
{
NodeSet **x = &cases.cases[c];
if (!*x) {
if (cases.otherwise)
*x = new NodeSet(*cases.otherwise);
else
*x = new NodeSet;
}
(*x)->insert(followpos.begin(), followpos.end());
}
int eq(Node *other)
{
CharNode *o = dynamic_cast<CharNode *>(other);
if (o) {
return c == o->c;
}
return 0;
}
ostream& dump(ostream& os)
{
return os << c;
}
uchar c;
};
/* Match a set of characters (/[abc]/). */
class CharSetNode : public CNode {
public:
CharSetNode(Chars& chars) : chars(chars) { }
void follow(NodeCases& cases)
{
for (Chars::iterator i = chars.begin(); i != chars.end(); i++) {
NodeSet **x = &cases.cases[*i];
if (!*x) {
if (cases.otherwise)
*x = new NodeSet(*cases.otherwise);
else
*x = new NodeSet;
}
(*x)->insert(followpos.begin(), followpos.end());
}
}
int eq(Node *other)
{
CharSetNode *o = dynamic_cast<CharSetNode *>(other);
if (!o || chars.size() != o->chars.size())
return 0;
for (Chars::iterator i = chars.begin(), j = o->chars.begin();
i != chars.end() && j != o->chars.end();
i++, j++) {
if (*i != *j)
return 0;
}
return 1;
}
ostream& dump(ostream& os)
{
os << '[';
for (Chars::iterator i = chars.begin(); i != chars.end(); i++)
os << *i;
return os << ']';
}
Chars chars;
};
/* Match all except one character (/[^abc]/). */
class NotCharSetNode : public CNode {
public:
NotCharSetNode(Chars& chars) : chars(chars) { }
void follow(NodeCases& cases)
{
if (!cases.otherwise)
cases.otherwise = new NodeSet;
for (Chars::iterator j = chars.begin(); j != chars.end(); j++) {
NodeSet **x = &cases.cases[*j];
if (!*x)
*x = new NodeSet(*cases.otherwise);
}
/* Note: Add to the nonmatching characters after copying away
* the old otherwise state for the matching characters.
*/
cases.otherwise->insert(followpos.begin(), followpos.end());
for (NodeCases::iterator i = cases.begin(); i != cases.end();
i++) {
if (chars.find(i->first) == chars.end())
i->second->insert(followpos.begin(),
followpos.end());
}
}
int eq(Node *other)
{
NotCharSetNode *o = dynamic_cast<NotCharSetNode *>(other);
if (!o || chars.size() != o->chars.size())
return 0;
for (Chars::iterator i = chars.begin(), j = o->chars.begin();
i != chars.end() && j != o->chars.end();
i++, j++) {
if (*i != *j)
return 0;
}
return 1;
}
ostream& dump(ostream& os)
{
os << "[^";
for (Chars::iterator i = chars.begin(); i != chars.end(); i++)
os << *i;
return os << ']';
}
Chars chars;
};
/* Match any character (/./). */
class AnyCharNode : public CNode {
public:
AnyCharNode() { }
void follow(NodeCases& cases)
{
if (!cases.otherwise)
cases.otherwise = new NodeSet;
cases.otherwise->insert(followpos.begin(), followpos.end());
for (NodeCases::iterator i = cases.begin(); i != cases.end();
i++)
i->second->insert(followpos.begin(), followpos.end());
}
int eq(Node *other)
{
if (dynamic_cast<AnyCharNode *>(other))
return 1;
return 0;
}
ostream& dump(ostream& os) {
return os << ".";
}
};
/**
 * Indicate that a regular expression matches. An AcceptNode itself
 * doesn't match anything, so it will never generate any transitions.
 */
class AcceptNode : public ImportantNode {
public:
	AcceptNode() { }

	/* Deliberately a no-op: AcceptNodes are shared, so they are not
	 * deleted via release; they will be deleted when the table they are
	 * stored in is deleted.
	 */
	void release(void) { }

	/* Nothing to follow. */
	void follow(NodeCases& cases __attribute__((unused))) { }

	/* Identity comparison: requires accept nodes to be common by
	 * pointer.
	 */
	int eq(Node *other)
	{
		if (!dynamic_cast<AcceptNode *>(other))
			return 0;
		return this == other;
	}
};
/* Match a node zero or more times. (This is a unary operator.) */
class StarNode : public OneChildNode {
public:
StarNode(Node *left) : OneChildNode(left)
{
nullable = true;
}
void compute_firstpos()
{
firstpos = child[0]->firstpos;
}
void compute_lastpos()
{
lastpos = child[0]->lastpos;
}
void compute_followpos()
{
NodeSet from = child[0]->lastpos, to = child[0]->firstpos;
for(NodeSet::iterator i = from.begin(); i != from.end(); i++) {
(*i)->followpos.insert(to.begin(), to.end());
}
}
int eq(Node *other) {
if (dynamic_cast<StarNode *>(other))
return child[0]->eq(other->child[0]);
return 0;
}
ostream& dump(ostream& os)
{
os << '(';
child[0]->dump(os);
return os << ")*";
}
};
/* Match a node one or more times. (This is a unary operator.) */
class PlusNode : public OneChildNode {
public:
PlusNode(Node *left) : OneChildNode(left) { }
void compute_nullable()
{
nullable = child[0]->nullable;
}
void compute_firstpos()
{
firstpos = child[0]->firstpos;
}
void compute_lastpos()
{
lastpos = child[0]->lastpos;
}
void compute_followpos()
{
NodeSet from = child[0]->lastpos, to = child[0]->firstpos;
for(NodeSet::iterator i = from.begin(); i != from.end(); i++) {
(*i)->followpos.insert(to.begin(), to.end());
}
}
int eq(Node *other)
{
if (dynamic_cast<PlusNode *>(other))
return child[0]->eq(other->child[0]);
return 0;
}
ostream& dump(ostream& os)
{
os << '(';
child[0]->dump(os);
return os << ")+";
}
};
/* Match a pair of consecutive nodes. */
class CatNode : public TwoChildNode {
public:
CatNode(Node *left, Node *right) : TwoChildNode(left, right) { }
void compute_nullable()
{
nullable = child[0]->nullable && child[1]->nullable;
}
void compute_firstpos()
{
if (child[0]->nullable)
firstpos = child[0]->firstpos + child[1]->firstpos;
else
firstpos = child[0]->firstpos;
}
void compute_lastpos()
{
if (child[1]->nullable)
lastpos = child[0]->lastpos + child[1]->lastpos;
else
lastpos = child[1]->lastpos;
}
void compute_followpos()
{
NodeSet from = child[0]->lastpos, to = child[1]->firstpos;
for(NodeSet::iterator i = from.begin(); i != from.end(); i++) {
(*i)->followpos.insert(to.begin(), to.end());
}
}
int eq(Node *other) {
if (dynamic_cast<CatNode *>(other)) {
if (!child[0]->eq(other->child[0]))
return 0;
return child[1]->eq(other->child[1]);
}
return 0;
}
ostream& dump(ostream& os)
{
child[0]->dump(os);
child[1]->dump(os);
return os;
}
};
/* Match one of two alternative nodes. */
class AltNode : public TwoChildNode {
public:
	AltNode(Node *left, Node *right) : TwoChildNode(left, right) { }

	/* An alternation can be empty if either branch can be. */
	void compute_nullable()
	{
		nullable = child[0]->nullable || child[1]->nullable;
	}

	/* First positions are the union of both branches' first positions. */
	void compute_firstpos()
	{
		firstpos = child[0]->firstpos + child[1]->firstpos;
	}

	/* Last positions are the union of both branches' last positions. */
	void compute_lastpos()
	{
		lastpos = child[0]->lastpos + child[1]->lastpos;
	}

	/* Equal to another AltNode whose children are pairwise equal, in
	 * the same order.
	 */
	int eq(Node *other)
	{
		if (!dynamic_cast<AltNode *>(other))
			return 0;
		if (!child[0]->eq(other->child[0]))
			return 0;
		return child[1]->eq(other->child[1]);
	}

	ostream& dump(ostream& os)
	{
		os << '(';
		child[0]->dump(os);
		os << '|';
		child[1]->dump(os);
		return os << ')';
	}
};
/* Traverse the syntax tree depth-first in an iterator-like manner. */
class depth_first_traversal {
stack<Node *> pos;
void push_left(Node *node)
{
pos.push(node);
while (dynamic_cast<InnerNode *>(node)) {
pos.push(node->child[0]);
node = node->child[0];
}
}
public:
depth_first_traversal(Node *node)
{
push_left(node);
}
Node *operator*()
{
return pos.top();
}
Node* operator->()
{
return pos.top();
}
operator bool()
{
return !pos.empty();
}
void operator++(int)
{
Node *last = pos.top();
pos.pop();
if (!pos.empty()) {
/* no need to dynamic cast, as we just popped a node so
* the top node must be an inner node */
InnerNode *node = (InnerNode *)(pos.top());
if (node->child[1] && node->child[1] != last) {
push_left(node->child[1]);
}
}
}
};
/* A set of integer counters, one per field.
 * NOTE(review): judging by the field names these are per-node-type tallies
 * for a tree (CharNode, CharSetNode, NotCharSetNode, AltNode, PlusNode,
 * StarNode, AnyCharNode, CatNode) - confirm against the code that fills
 * them in, which is not in this header.
 */
struct node_counts {
int charnode;
int charset;
int notcharset;
int alt;
int plus;
int star;
int any;
int cat;
};
extern EpsNode epsnode;
int debug_tree(Node *t);
Node *simplify_tree(Node *t, dfaflags_t flags);
void label_nodes(Node *root);
unsigned long hash_NodeSet(NodeSet *ns);
/* Strict ordering for <hash, NodeSet *> pairs.
 *
 * Pairs are ordered by hash first. When the hashes are equal the
 * pointed-to sets themselves are compared with set's operator<, i.e.
 * element by element on the node pointers they contain. The pointer
 * comparison allows us to tell sets we have already seen from new ones
 * when constructing the DFA.
 */
struct deref_less_than {
	bool operator()(pair <unsigned long, NodeSet *> const & lhs,
			pair <unsigned long, NodeSet *> const & rhs) const
	{
		if (lhs.first != rhs.first)
			return lhs.first < rhs.first;
		return *(lhs.second) < *(rhs.second);
	}
};
#endif /* __LIBAA_RE_EXPR */