apparmor/parser/libapparmor_re/expr-tree.cc
John Johansen 501e87a3f2 parser: Cleanup parser control flags, so they display as expected to user
Instead of having multiple tables, since we have room post split
of optimization and dump flags just move all the optimization and
dump flags into a common table.

We can if needed switch the flag entry size to a long in the future.

Signed-off-by: John Johansen <john.johansen@canonical.com>
2023-07-08 19:58:59 -07:00

658 lines
17 KiB
C++

/*
* (C) 2006, 2007 Andreas Gruenbacher <agruen@suse.de>
* Copyright (c) 2003-2008 Novell, Inc. (All rights reserved)
* Copyright 2009-2013 Canonical Ltd.
*
* The libapparmor library is licensed under the terms of the GNU
* Lesser General Public License, version 2.1. Please see the file
* COPYING.LGPL.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* Functions to create/manipulate an expression tree for regular expressions
* that have been parsed.
*
* The expression tree can be used directly after the parse creates it, or
* it can be factored so that the set of important nodes is smaller.
* Having a reduced set of important nodes generally results in a dfa that
* is closer to minimum (fewer redundant states are created). It also
* results in fewer important nodes in the state set during subset
* construction resulting in less memory used to create a dfa.
*
* Generally it is worth doing expression tree simplification before dfa
* construction, if the regular expression tree contains any alternations.
* Even if the regular expression doesn't simplification should be fast
* enough that it can be used with minimal overhead.
*/
#include <stdio.h>
#include <string.h>
#include "expr-tree.h"
#include "apparmor_re.h"
/* Use a single static EpsNode as it carries no node specific information */
EpsNode epsnode;
ostream &transchar::dump(ostream &os) const
{
const char *search = "\a\033\f\n\r\t|*+[](). ",
*replace = "aefnrt|*+[](). ", *s;
if (this->c < 0)
os << "-0x" << hex << -this->c << dec;
else if (this->c > 255)
os << "0x" << hex << this->c << dec;
else if ((s = strchr(search, this->c)) && *s != '\0')
os << '\\' << replace[s - search] << " 0x" << hex << this->c << dec;
else if (!isprint(this->c))
os << "0x" << hex << this->c << dec;
else
os << (char)this->c << " 0x" << hex << this->c << dec;
return os;
}
ostream &operator<<(ostream &os, transchar tc)
{
const char *search = "\a\033\f\n\r\t|*+[](). ",
*replace = "aefnrt|*+[](). ", *s;
short c = tc.c;
if (c < 0)
os << "\\d" << "" << tc.c;
else if ((s = strchr(search, c)) && *s != '\0')
os << '\\' << replace[s - search];
else if (!isprint(c))
os << "\\x" << hex << c << dec;
else
os << (char)c;
return os;
}
/**
* Text-dump a state (for debugging).
*/
ostream &operator<<(ostream &os, const NodeSet &state)
{
os << '{';
if (!state.empty()) {
NodeSet::iterator i = state.begin();
for (;;) {
os << (*i)->label;
if (++i == state.end())
break;
os << ',';
}
}
os << '}';
return os;
}
ostream &operator<<(ostream &os, Node &node)
{
node.dump(os);
return os;
}
/**
* hash_NodeSet - generate a hash for the Nodes in the set
*/
unsigned long hash_NodeSet(NodeSet *ns)
{
unsigned long hash = 5381;
for (NodeSet::iterator i = ns->begin(); i != ns->end(); i++) {
hash = ((hash << 5) + hash) + (unsigned long)*i;
}
return hash;
}
/**
* label_nodes - label the node positions for pretty-printing debug output
*
* TODO: separate - node labels should be separate and optional, if not
* present pretty printing should use Node address
*/
void label_nodes(Node *root)
{
int nodes = 1;
for (depth_first_traversal i(root); i; i++)
i->label = nodes++;
}
/**
* Text-dump the syntax tree (for debugging).
*/
void Node::dump_syntax_tree(ostream &os)
{
for (depth_first_traversal i(this); i; i++) {
os << i->label << '\t';
if ((*i)->child[0] == 0)
os << **i << '\t' << (*i)->followpos << endl;
else {
if ((*i)->child[1] == 0)
os << (*i)->child[0]->label << **i;
else
os << (*i)->child[0]->label << **i
<< (*i)->child[1]->label;
os << '\t' << (*i)->firstpos << (*i)->lastpos << endl;
}
}
os << endl;
}
/*
* Normalize the regex parse tree for factoring and cancellations. Normalization
* reorganizes internal (alt and cat) nodes into a fixed "normalized" form that
* simplifies factoring code, in that it produces a canonicalized form for
* the direction being normalized so that the factoring code does not have
* to consider as many cases.
*
* left normalization (dir == 0) uses these rules
* (E | a) -> (a | E)
* (a | b) | c -> a | (b | c)
* (ab)c -> a(bc)
*
* right normalization (dir == 1) uses the same rules but reversed
* (a | E) -> (E | a)
* a | (b | c) -> (a | b) | c
* a(bc) -> (ab)c
*
* Note: This is written iteratively for a given node (the top node stays
* fixed and the children are rotated) instead of recursively.
* For a given node under examination rotate over nodes from
* dir to !dir. Until no dir direction node meets the criterial.
* Then recurse to the children (which will have a different node type)
* to make sure they are normalized.
* Normalization of a child node is guaranteed to not affect the
* normalization of the parent.
*
* For cat nodes the depth first traverse order is guaranteed to be
* maintained. This is not necessary for altnodes.
*
* Eg. For left normalization
*
* |1 |1
* / \ / \
* |2 T -> a |2
* / \ / \
* |3 c b |3
* / \ / \
* a b c T
*
*/
static void rotate_node(Node *t, int dir)
{
// (a | b) | c -> a | (b | c)
// (ab)c -> a(bc)
Node *left = t->child[dir];
t->child[dir] = left->child[dir];
left->child[dir] = left->child[!dir];
left->child[!dir] = t->child[!dir];
t->child[!dir] = left;
}
/* return False if no work done */
int TwoChildNode::normalize_eps(int dir)
{
if ((&epsnode == child[dir]) &&
(&epsnode != child[!dir])) {
// (E | a) -> (a | E)
// Ea -> aE
// Test for E | (E | E) and E . (E . E) which will
// result in an infinite loop
Node *c = child[!dir];
if (c->is_type(NODE_TYPE_TWOCHILD) &&
&epsnode == c->child[dir] &&
&epsnode == c->child[!dir]) {
c->release();
c = &epsnode;
}
child[!dir] = child[dir];
child[dir] = c;
return 1;
}
return 0;
}
void CatNode::normalize(int dir)
{
for (;;) {
if (normalize_eps(dir)) {
continue;
} else if (child[dir]->is_type(NODE_TYPE_CAT)) {
// (ab)c -> a(bc)
rotate_node(this, dir);
} else {
break;
}
}
if (child[dir])
child[dir]->normalize(dir);
if (child[!dir])
child[!dir]->normalize(dir);
}
void AltNode::normalize(int dir)
{
for (;;) {
if (normalize_eps(dir)) {
continue;
} else if (child[dir]->is_type(NODE_TYPE_ALT)) {
// (a | b) | c -> a | (b | c)
rotate_node(this, dir);
} else if (child[dir]->is_type(NODE_TYPE_CHARSET) &&
child[!dir]->is_type(NODE_TYPE_CHAR)) {
// [a] | b -> b | [a]
Node *c = child[dir];
child[dir] = child[!dir];
child[!dir] = c;
} else {
break;
}
}
if (child[dir])
child[dir]->normalize(dir);
if (child[!dir])
child[!dir]->normalize(dir);
}
//charset conversion is disabled for now,
//it hinders tree optimization in some cases, so it need to be either
//done post optimization, or have extra factoring rules added
#if 0
static Node *merge_charset(Node *a, Node *b)
{
if (dynamic_cast<CharNode *>(a) && dynamic_cast<CharNode *>(b)) {
Chars chars;
chars.insert(dynamic_cast<CharNode *>(a)->c);
chars.insert(dynamic_cast<CharNode *>(b)->c);
CharSetNode *n = new CharSetNode(chars);
return n;
} else if (dynamic_cast<CharNode *>(a) &&
dynamic_cast<CharSetNode *>(b)) {
Chars *chars = &dynamic_cast<CharSetNode *>(b)->chars;
chars->insert(dynamic_cast<CharNode *>(a)->c);
return b;
} else if (dynamic_cast<CharSetNode *>(a) &&
dynamic_cast<CharSetNode *>(b)) {
Chars *from = &dynamic_cast<CharSetNode *>(a)->chars;
Chars *to = &dynamic_cast<CharSetNode *>(b)->chars;
for (Chars::iterator i = from->begin(); i != from->end(); i++)
to->insert(*i);
return b;
}
//return ???;
}
static Node *alt_to_charsets(Node *t, int dir)
{
/*
Node *first = NULL;
Node *p = t;
Node *i = t;
for (;dynamic_cast<AltNode *>(i);) {
if (dynamic_cast<CharNode *>(i->child[dir]) ||
dynamic_cast<CharNodeSet *>(i->child[dir])) {
if (!first) {
first = i;
p = i;
i = i->child[!dir];
} else {
first->child[dir] = merge_charset(first->child[dir],
i->child[dir]);
p->child[!dir] = i->child[!dir];
Node *tmp = i;
i = tmp->child[!dir];
tmp->child[!dir] = NULL;
tmp->release();
}
} else {
p = i;
i = i->child[!dir];
}
}
// last altnode of chain check other dir as well
if (first && (dynamic_cast<charNode *>(i) ||
dynamic_cast<charNodeSet *>(i))) {
}
*/
/*
if (dynamic_cast<CharNode *>(t->child[dir]) ||
dynamic_cast<CharSetNode *>(t->child[dir]))
char_test = true;
(char_test &&
(dynamic_cast<CharNode *>(i->child[dir]) ||
dynamic_cast<CharSetNode *>(i->child[dir])))) {
*/
return t;
}
#endif
static Node *basic_alt_factor(Node *t, int dir)
{
if (!t->is_type(NODE_TYPE_ALT))
return t;
if (t->child[dir]->eq(t->child[!dir])) {
// (a | a) -> a
Node *tmp = t->child[dir];
t->child[dir] = NULL;
t->release();
return tmp;
}
// (ab) | (ac) -> a(b|c)
if (t->child[dir]->is_type(NODE_TYPE_CAT) &&
t->child[!dir]->is_type(NODE_TYPE_CAT) &&
t->child[dir]->child[dir]->eq(t->child[!dir]->child[dir])) {
// (ab) | (ac) -> a(b|c)
Node *left = t->child[dir];
Node *right = t->child[!dir];
t->child[dir] = left->child[!dir];
t->child[!dir] = right->child[!dir];
right->child[!dir] = NULL;
right->release();
left->child[!dir] = t;
return left;
}
// a | (ab) -> a (E | b) -> a (b | E)
if (t->child[!dir]->is_type(NODE_TYPE_CAT) &&
t->child[dir]->eq(t->child[!dir]->child[dir])) {
Node *c = t->child[!dir];
t->child[dir]->release();
t->child[dir] = c->child[!dir];
t->child[!dir] = &epsnode;
c->child[!dir] = t;
return c;
}
// ab | (a) -> a (b | E)
if (t->child[dir]->is_type(NODE_TYPE_CAT) &&
t->child[dir]->child[dir]->eq(t->child[!dir])) {
Node *c = t->child[dir];
t->child[!dir]->release();
t->child[dir] = c->child[!dir];
t->child[!dir] = &epsnode;
c->child[!dir] = t;
return c;
}
return t;
}
static Node *basic_simplify(Node *t, int dir)
{
if (t->is_type(NODE_TYPE_CAT) && &epsnode == t->child[!dir]) {
// aE -> a
Node *tmp = t->child[dir];
t->child[dir] = NULL;
t->release();
return tmp;
}
return basic_alt_factor(t, dir);
}
/*
* assumes a normalized tree. reductions shown for left normalization
* aE -> a
* (a | a) -> a
** factoring patterns
* a | (a | b) -> (a | b)
* a | (ab) -> a (E | b) -> a (b | E)
* (ab) | (ac) -> a(b|c)
*
* returns t - if no simplifications were made
* a new root node - if simplifications were made
*/
Node *simplify_tree_base(Node *t, int dir, bool &mod)
{
if (t->is_type(NODE_TYPE_IMPORTANT))
return t;
for (int i = 0; i < 2; i++) {
if (t->child[i]) {
Node *c = simplify_tree_base(t->child[i], dir, mod);
if (c != t->child[i]) {
t->child[i] = c;
mod = true;
}
}
}
// only iterate on loop if modification made
for (;; mod = true) {
Node *tmp = basic_simplify(t, dir);
if (tmp != t) {
t = tmp;
continue;
}
/* all tests after this must meet 2 alt node condition */
if (!t->is_type(NODE_TYPE_ALT) ||
!t->child[!dir]->is_type(NODE_TYPE_ALT))
break;
// a | (a | b) -> (a | b)
// a | (b | (c | a)) -> (b | (c | a))
Node *p = t;
Node *i = t->child[!dir];
for (; i->is_type(NODE_TYPE_ALT); p = i, i = i->child[!dir]) {
if (t->child[dir]->eq(i->child[dir])) {
Node *tmp = t->child[!dir];
t->child[!dir] = NULL;
t->release();
t = tmp;
continue;
}
}
// last altnode of chain check other dir as well
if (t->child[dir]->eq(p->child[!dir])) {
Node *tmp = t->child[!dir];
t->child[!dir] = NULL;
t->release();
t = tmp;
continue;
}
//exact match didn't work, try factoring front
//a | (ac | (ad | () -> (a (E | c)) | (...)
//ab | (ac | (...)) -> (a (b | c)) | (...)
//ab | (a | (...)) -> (a (b | E)) | (...)
Node *pp;
int count = 0;
Node *subject = t->child[dir];
Node *a = subject;
if (subject->is_type(NODE_TYPE_CAT))
a = subject->child[dir];
for (pp = p = t, i = t->child[!dir];
i->is_type(NODE_TYPE_ALT);) {
if ((i->child[dir]->is_type(NODE_TYPE_CAT) &&
a->eq(i->child[dir]->child[dir])) ||
(a->eq(i->child[dir]))) {
// extract matching alt node
p->child[!dir] = i->child[!dir];
i->child[!dir] = subject;
subject = basic_simplify(i, dir);
if (subject->is_type(NODE_TYPE_CAT))
a = subject->child[dir];
else
a = subject;
i = p->child[!dir];
count++;
} else {
pp = p;
p = i;
i = i->child[!dir];
}
}
// last altnode in chain check other dir as well
if ((i->is_type(NODE_TYPE_CAT) &&
a->eq(i->child[dir])) || (a->eq(i))) {
count++;
if (t == p) {
t->child[dir] = subject;
t = basic_simplify(t, dir);
} else {
t->child[dir] = p->child[dir];
p->child[dir] = subject;
pp->child[!dir] = basic_simplify(p, dir);
}
} else {
t->child[dir] = i;
p->child[!dir] = subject;
}
if (count == 0)
break;
}
return t;
}
int debug_tree(Node *t)
{
int nodes = 1;
if (!t->is_type(NODE_TYPE_IMPORTANT)) {
if (t->child[0])
nodes += debug_tree(t->child[0]);
if (t->child[1])
nodes += debug_tree(t->child[1]);
}
return nodes;
}
static void count_tree_nodes(Node *t, struct node_counts *counts)
{
if (t->is_type(NODE_TYPE_ALT)) {
counts->alt++;
count_tree_nodes(t->child[0], counts);
count_tree_nodes(t->child[1], counts);
} else if (t->is_type(NODE_TYPE_CAT)) {
counts->cat++;
count_tree_nodes(t->child[0], counts);
count_tree_nodes(t->child[1], counts);
} else if (t->is_type(NODE_TYPE_PLUS)) {
counts->plus++;
count_tree_nodes(t->child[0], counts);
} else if (t->is_type(NODE_TYPE_STAR)) {
counts->star++;
count_tree_nodes(t->child[0], counts);
} else if (t->is_type(NODE_TYPE_OPTIONAL)) {
counts->optional++;
count_tree_nodes(t->child[0], counts);
} else if (t->is_type(NODE_TYPE_CHAR)) {
counts->charnode++;
} else if (t->is_type(NODE_TYPE_ANYCHAR)) {
counts->any++;
} else if (t->is_type(NODE_TYPE_CHARSET)) {
counts->charset++;
} else if (t->is_type(NODE_TYPE_NOTCHARSET)) {
counts->notcharset++;
}
}
#include "stdio.h"
#include "stdint.h"
#include "apparmor_re.h"
// maximum number of passes to iterate on the expression tree doing
// simplification passes. Simplification may exit sooner if no changes
// are made.
#define MAX_PASSES 1
Node *simplify_tree(Node *t, optflags const &opts)
{
bool update = true;
int i;
if (opts.dump & DUMP_DFA_TREE_STATS) {
struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
count_tree_nodes(t, &counts);
fprintf(stderr,
"expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n",
counts.charnode, counts.charset, counts.notcharset,
counts.alt, counts.plus, counts.star, counts.any,
counts.cat);
}
for (i = 0; update && i < MAX_PASSES; i++) {
update = false;
//default to right normalize first as this reduces the number
//of trailing nodes which might follow an internal *
//or **, which is where state explosion can happen
//eg. in one test this makes the difference between
// the dfa having about 7 thousands states,
// and it having about 1.25 million states
int dir = 1;
if (opts.control & CONTROL_DFA_TREE_LEFT)
dir = 0;
for (int count = 0; count < 2; count++) {
bool modified;
do {
modified = false;
if (opts.control & CONTROL_DFA_TREE_NORMAL)
t->normalize(dir);
t = simplify_tree_base(t, dir, modified);
if (modified)
update = true;
} while (modified);
if (opts.control & CONTROL_DFA_TREE_LEFT)
dir++;
else
dir--;
}
}
if (opts.dump & DUMP_DFA_TREE_STATS) {
struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
count_tree_nodes(t, &counts);
fprintf(stderr,
"simplified expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n",
counts.charnode, counts.charset, counts.notcharset,
counts.alt, counts.plus, counts.star, counts.any,
counts.cat);
}
return t;
}
/**
* Flip the children of all cat nodes. This causes strings to be matched
* back-forth.
*/
void flip_tree(Node *node)
{
for (depth_first_traversal i(node); i; i++) {
if ((*i)->is_type(NODE_TYPE_CAT)) {
CatNode *cat = static_cast<CatNode *>(*i);
swap(cat->child[0], cat->child[1]);
}
}
}
void dump_regex_rec(ostream &os, Node *tree)
{
if (tree->child[0])
dump_regex_rec(os, tree->child[0]);
os << *tree;
if (tree->child[1])
dump_regex_rec(os, tree->child[1]);
}
void dump_regex(ostream &os, Node *tree)
{
dump_regex_rec(os, tree);
os << endl;
}