apparmor/parser/libapparmor_re/aare_rules.cc

329 lines
8.4 KiB
C++
Raw Normal View History

/*
* (C) 2006, 2007 Andreas Gruenbacher <agruen@suse.de>
* Copyright (c) 2003-2008 Novell, Inc. (All rights reserved)
* Copyright 2009-2013 Canonical Ltd. (All rights reserved)
*
* The libapparmor library is licensed under the terms of the GNU
* Lesser General Public License, version 2.1. Please see the file
* COPYING.LGPL.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* Wrapper around the dfa to convert aa rules into a dfa
*/
#include <ostream>
#include <iostream>
#include <fstream>
#include <sstream>
#include <ext/stdio_filebuf.h>
#include <assert.h>
#include <stdlib.h>
#include "aare_rules.h"
#include "expr-tree.h"
#include "parse.h"
#include "hfa.h"
#include "chfa.h"
#include "../immunix.h"
aare_rules::~aare_rules(void)
{
if (root)
root->release();
unique_perms.clear();
expr_map.clear();
}
bool aare_rules::add_rule(const char *rule, int deny, uint32_t perms,
uint32_t audit, dfaflags_t flags)
{
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
return add_rule_vec(deny, perms, audit, 1, &rule, flags, false);
}
void aare_rules::add_to_rules(Node *tree, Node *perms)
{
if (reverse)
flip_tree(tree);
Node *base = expr_map[perms];
if (base)
expr_map[perms] = new AltNode(base, tree);
else
expr_map[perms] = tree;
}
static Node *cat_with_null_separator(Node *l, Node *r)
{
return new CatNode(new CatNode(l, new CharNode(0)), r);
}
static Node *cat_with_oob_separator(Node *l, Node *r)
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
{
return new CatNode(new CatNode(l, new CharNode(transchar(-1, true))), r);
}
bool aare_rules::add_rule_vec(int deny, uint32_t perms, uint32_t audit,
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
int count, const char **rulev, dfaflags_t flags,
bool oob)
{
Node *tree = NULL, *accept;
int exact_match;
if (regex_parse(&tree, rulev[0]))
return false;
for (int i = 1; i < count; i++) {
Node *subtree = NULL;
if (regex_parse(&subtree, rulev[i]))
goto err;
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
if (oob)
tree = cat_with_oob_separator(tree, subtree);
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
else
tree = cat_with_null_separator(tree, subtree);
}
/*
* Check if we have an expression with or without wildcards. This
* determines how exec modifiers are merged in accept_perms() based
* on how we split permission bitmasks here.
*/
exact_match = 1;
for (depth_first_traversal i(tree); i && exact_match; i++) {
parser: replace dynamic_cast with is_type method The dynamic_cast operator is slow as it needs to look at RTTI information and even does some string comparisons, especially in deep hierarchies like the one for Node. Profiling with callgrind showed that dynamic_cast can eat a huge portion of the running time, as it takes most of the time that is spent in the simplify_tree() function. For some complex profiles, the number of calls to dynamic_cast can be in the range of millions. This commit replaces the use of dynamic_cast in the Node hierarchy with a method called is_type(), which returns true if the pointer can be casted to the specified type. It works by looking at a Node object field that is an integer with bits set for each type up in the hierarchy. Therefore, dynamic_cast is replaced by a simple bits operation. This change can reduce the compilation times for some profiles more that 50%, especially in arm/arm64 arch. This opens the door to maybe avoid "-O no-expr-simplify" in the snapd daemon, as now that option would make the compilation slower in almost all cases. This is the example profile used in some of my tests, with this change the run-time is around 1/3 of what it was before on an x86 laptop: profile "test" (attach_disconnected,mediate_deleted) { dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.fcitx.Fcitx.InputContext member="{Close,Destroy,Enable}IC" peer=(label=unconfined), dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.fcitx.Fcitx.InputContext member=Reset peer=(label=unconfined), dbus receive bus=fcitx peer=(label=unconfined), dbus receive bus=session interface=org.fcitx.Fcitx.* peer=(label=unconfined), dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.fcitx.Fcitx.InputContext member="Focus{In,Out}" peer=(label=unconfined), dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.fcitx.Fcitx.InputContext member="{CommitPreedit,Set*}" peer=(label=unconfined), dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.fcitx.Fcitx.InputContext member="{MouseEvent,ProcessKeyEvent}" peer=(label=unconfined), dbus send bus={fcitx,session} path=/inputcontext_[0-9]* interface=org.freedesktop.DBus.Properties member=GetAll peer=(label=unconfined), dbus (send) bus=session path=/org/a11y/bus interface=org.a11y.Bus member=GetAddress peer=(label=unconfined), dbus (send) bus=session path=/org/a11y/bus interface=org.freedesktop.DBus.Properties member=Get{,All} peer=(label=unconfined), dbus (receive, send) bus=accessibility path=/org/a11y/atspi/** peer=(label=unconfined), dbus (send) bus=system path=/org/freedesktop/Accounts interface=org.freedesktop.DBus.Introspectable member=Introspect peer=(label=unconfined), dbus (send) bus=system path=/org/freedesktop/Accounts interface=org.freedesktop.Accounts member=FindUserById peer=(label=unconfined), dbus (receive, send) bus=system path=/org/freedesktop/Accounts/User[0-9]* interface=org.freedesktop.DBus.Properties member={Get,PropertiesChanged} peer=(label=unconfined), dbus (send) bus=session interface=org.gtk.Actions member=Changed peer=(name=org.freedesktop.DBus, label=unconfined), dbus (receive) bus=session interface=org.gtk.Actions member={Activate,DescribeAll,SetState} peer=(label=unconfined), dbus (receive) bus=session interface=org.gtk.Menus member={Start,End} peer=(label=unconfined), dbus (send) bus=session interface=org.gtk.Menus member=Changed peer=(name=org.freedesktop.DBus, label=unconfined), dbus (send) bus=session path="/com/ubuntu/MenuRegistrar" interface="com.ubuntu.MenuRegistrar" member="{Register,Unregister}{App,Surface}Menu" peer=(label=unconfined), }
2021-02-15 16:26:18 +01:00
if ((*i)->is_type(NODE_TYPE_STAR) ||
(*i)->is_type(NODE_TYPE_PLUS) ||
(*i)->is_type(NODE_TYPE_ANYCHAR) ||
(*i)->is_type(NODE_TYPE_CHARSET) ||
(*i)->is_type(NODE_TYPE_NOTCHARSET))
exact_match = 0;
}
if (reverse)
flip_tree(tree);
accept = unique_perms.insert(deny, perms, audit, exact_match);
if (flags & DFA_DUMP_RULE_EXPR) {
const char *separator;
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
if (oob)
separator = "\\-x01";
add ability to use out of band transitions Currently the NULL character is used as an out of band transition for string/path elements. This works for them as the NULL character is not valid for this data. However this does not work for binary data that can contain a NULL character. So far we have only dealt with fixed length fields of binary data making the NULL separator either unnecessary. However binary data like in the xattr match and mount data field are variable length and can contain NULL characters. To deal with this add the ability to specify out of band transitions, that can only be triggered by code not input data. The out of band transition can be used to separate variable length data fields just as the NULL transition has been used to separate variable length strings. In the compressed hfa out of band transitions are expressed as a negative offset from the states base. This leaves us room to expand the character match range in the future if desired and on average makes the range between the out of band transition and the input transitions smaller than would be had if the out of band transition had been stored after the valid input transitions. Out of band transitions in the dfa will not break old kernels that don't know about them, but they won't be able to trigger the out of band transition match. So they should not be used unless the kernel indicates that it supports them. It should be noted that this patch only adds support for a single out of band transition. If multiple out of band transitions are required. It is trivial to extend. - Add a tag indicating support in the kernel - add a oob max range field to the dfa header so the kernel knows what the max range that needs verifying is. - extend oob generation fns to generate oob based on value instead of a fixed -1. Signed-off-by: John Johansen <john.johansen@canonical.com>
2019-08-11 06:18:27 -07:00
else
separator = "\\x00";
cerr << "rule: ";
cerr << rulev[0];
for (int i = 1; i < count; i++) {
cerr << separator;
cerr << rulev[i];
}
cerr << " -> ";
tree->dump(cerr);
if (deny)
cerr << " deny";
cerr << " (0x" << hex << perms <<"/" << audit << dec << ")";
accept->dump(cerr);
cerr << "\n\n";
}
add_to_rules(tree, accept);
rule_count++;
return true;
err:
delete tree;
return false;
}
/*
* append_rule is like add_rule, but appends the rule to any existing rules
* with a separating transition. The appended rule matches with the same
* permissions as the rule it's appended to. If there are no existing rules
* append_rule returns true.
*
* This is used by xattrs matching where, after matching the path, the DFA is
* advanced by a null character for each xattr.
*/
bool aare_rules::append_rule(const char *rule, bool oob, bool with_perm,
dfaflags_t flags)
{
Node *tree = NULL;
if (regex_parse(&tree, rule))
return false;
if (flags & DFA_DUMP_RULE_EXPR) {
cerr << "rule: ";
cerr << rule;
cerr << " -> ";
tree->dump(cerr);
cerr << "\n\n";
}
/*
* For each matching state, we want to create an optional path
* separated by a separating character.
*
* When matching xattrs, the DFA must end up in an accepting state for
* the path, then each value of the xattrs. Using an optional node
* lets each rule end up in an accepting state.
*/
tree = new CatNode(oob ? new CharNode(transchar(-1, true)) : new CharNode(0), tree);
if (expr_map.size() == 0) {
// There's nothing to append to. Free the tree reference.
delete tree;
return true;
}
PermExprMap::iterator it;
for (it = expr_map.begin(); it != expr_map.end(); it++) {
if (with_perm)
expr_map[it->first] = new CatNode(it->second, new AltNode(it->first, tree));
else
expr_map[it->first] = new CatNode(it->second, tree);
}
return true;
}
/* create a dfa from the ruleset
* returns: buffer contain dfa tables, @size set to the size of the tables
* else NULL on failure, @min_match_len set to the shortest string
* that can match the dfa for determining xmatch priority.
*/
void *aare_rules::create_dfa(size_t *size, int *min_match_len, dfaflags_t flags,
bool filedfa)
{
char *buffer = NULL;
/* finish constructing the expr tree from the different permission
* set nodes */
PermExprMap::iterator i = expr_map.begin();
if (i != expr_map.end()) {
Move rule simplification into the tree construction phase The current rule simplification algorithm has issues that need to be addressed in a rewrite, but it is still often a win, especially for larger profiles. However doing rule simplification as a single pass limits what it can do. We default to right simplification first because this has historically shown the most benefits. For two reasons 1. It allowed better grouping of the split out accept nodes that we used to do (changed in previous patches) 2. because trailing regexes like /foo/**, /foo/**.txt, can be combined and they are the largest source of node set explosion. However the move to unique node sets, eliminates 1, and forces 2 to work within only the single unique permission set on the right side factoring pass, but it still incures the penalty of walking the whole tree looking for potential nodes to factor. Moving tree simplification into the construction phases gets rid of the need for the right side factoring pass to walk other node sets that will never combine, and since we are doing simplification we can do it before the cat and permission nodes are added reducing the set of nodes to look at by another two. We do loose the ability to combine nodes from different sets during the left factoring pass, but experimentation shows that doing simplification only within the unique permission sets achieve most of the factoring that a single global pass would achieve. Signed-off-by: John Johansen <john.johansen@canonical.com> Acked-by: Steve Beattie <steve@nxnw.org>
2015-06-25 16:38:04 -06:00
if (flags & DFA_CONTROL_TREE_SIMPLE) {
Node *tmp = simplify_tree(i->second, flags);
root = new CatNode(tmp, i->first);
} else
root = new CatNode(i->second, i->first);
for (i++; i != expr_map.end(); i++) {
Move rule simplification into the tree construction phase The current rule simplification algorithm has issues that need to be addressed in a rewrite, but it is still often a win, especially for larger profiles. However doing rule simplification as a single pass limits what it can do. We default to right simplification first because this has historically shown the most benefits. For two reasons 1. It allowed better grouping of the split out accept nodes that we used to do (changed in previous patches) 2. because trailing regexes like /foo/**, /foo/**.txt, can be combined and they are the largest source of node set explosion. However the move to unique node sets, eliminates 1, and forces 2 to work within only the single unique permission set on the right side factoring pass, but it still incures the penalty of walking the whole tree looking for potential nodes to factor. Moving tree simplification into the construction phases gets rid of the need for the right side factoring pass to walk other node sets that will never combine, and since we are doing simplification we can do it before the cat and permission nodes are added reducing the set of nodes to look at by another two. We do loose the ability to combine nodes from different sets during the left factoring pass, but experimentation shows that doing simplification only within the unique permission sets achieve most of the factoring that a single global pass would achieve. Signed-off-by: John Johansen <john.johansen@canonical.com> Acked-by: Steve Beattie <steve@nxnw.org>
2015-06-25 16:38:04 -06:00
Node *tmp;
if (flags & DFA_CONTROL_TREE_SIMPLE) {
tmp = simplify_tree(i->second, flags);
} else
tmp = i->second;
root = new AltNode(root, new CatNode(tmp, i->first));
}
}
*min_match_len = root->min_match_len();
Move rule simplification into the tree construction phase The current rule simplification algorithm has issues that need to be addressed in a rewrite, but it is still often a win, especially for larger profiles. However doing rule simplification as a single pass limits what it can do. We default to right simplification first because this has historically shown the most benefits. For two reasons 1. It allowed better grouping of the split out accept nodes that we used to do (changed in previous patches) 2. because trailing regexes like /foo/**, /foo/**.txt, can be combined and they are the largest source of node set explosion. However the move to unique node sets, eliminates 1, and forces 2 to work within only the single unique permission set on the right side factoring pass, but it still incures the penalty of walking the whole tree looking for potential nodes to factor. Moving tree simplification into the construction phases gets rid of the need for the right side factoring pass to walk other node sets that will never combine, and since we are doing simplification we can do it before the cat and permission nodes are added reducing the set of nodes to look at by another two. We do loose the ability to combine nodes from different sets during the left factoring pass, but experimentation shows that doing simplification only within the unique permission sets achieve most of the factoring that a single global pass would achieve. Signed-off-by: John Johansen <john.johansen@canonical.com> Acked-by: Steve Beattie <steve@nxnw.org>
2015-06-25 16:38:04 -06:00
/* dumping of the none simplified tree without -O no-expr-simplify
* is broken because we need to build the tree above first, and
* simplification is woven into the build. Reevaluate how to fix
* this debug dump.
*/
label_nodes(root);
if (flags & DFA_DUMP_TREE) {
cerr << "\nDFA: Expression Tree\n";
root->dump(cerr);
cerr << "\n\n";
}
if (flags & DFA_CONTROL_TREE_SIMPLE) {
Move rule simplification into the tree construction phase The current rule simplification algorithm has issues that need to be addressed in a rewrite, but it is still often a win, especially for larger profiles. However doing rule simplification as a single pass limits what it can do. We default to right simplification first because this has historically shown the most benefits. For two reasons 1. It allowed better grouping of the split out accept nodes that we used to do (changed in previous patches) 2. because trailing regexes like /foo/**, /foo/**.txt, can be combined and they are the largest source of node set explosion. However the move to unique node sets, eliminates 1, and forces 2 to work within only the single unique permission set on the right side factoring pass, but it still incures the penalty of walking the whole tree looking for potential nodes to factor. Moving tree simplification into the construction phases gets rid of the need for the right side factoring pass to walk other node sets that will never combine, and since we are doing simplification we can do it before the cat and permission nodes are added reducing the set of nodes to look at by another two. We do loose the ability to combine nodes from different sets during the left factoring pass, but experimentation shows that doing simplification only within the unique permission sets achieve most of the factoring that a single global pass would achieve. Signed-off-by: John Johansen <john.johansen@canonical.com> Acked-by: Steve Beattie <steve@nxnw.org>
2015-06-25 16:38:04 -06:00
/* This is old total tree, simplification point
* For now just do simplification up front. It gets most
* of the benefit running on the smaller chains, and is
* overall faster because there are less nodes. Reevaluate
* once tree simplification is rewritten
*/
//root = simplify_tree(root, flags);
if (flags & DFA_DUMP_SIMPLE_TREE) {
cerr << "\nDFA: Simplified Expression Tree\n";
root->dump(cerr);
cerr << "\n\n";
}
}
stringstream stream;
try {
DFA dfa(root, flags, filedfa);
if (flags & DFA_DUMP_UNIQ_PERMS)
dfa.dump_uniq_perms("dfa");
if (flags & DFA_CONTROL_MINIMIZE) {
dfa.minimize(flags);
if (flags & DFA_DUMP_MIN_UNIQ_PERMS)
dfa.dump_uniq_perms("minimized dfa");
}
if (flags & DFA_CONTROL_FILTER_DENY &&
flags & DFA_CONTROL_MINIMIZE &&
dfa.apply_and_clear_deny()) {
/* Do a second minimization pass as removal of deny
* information has moved some states from accepting
* to none accepting partitions
*
* TODO: add this as a tail pass to minimization
* so we don't need to do a full second pass
*/
dfa.minimize(flags);
if (flags & DFA_DUMP_MIN_UNIQ_PERMS)
dfa.dump_uniq_perms("minimized dfa");
}
if (flags & DFA_CONTROL_REMOVE_UNREACHABLE)
dfa.remove_unreachable(flags);
if (flags & DFA_DUMP_STATES)
dfa.dump(cerr);
if (flags & DFA_DUMP_GRAPH)
dfa.dump_dot_graph(cerr);
map<transchar, transchar> eq;
if (flags & DFA_CONTROL_EQUIV) {
eq = dfa.equivalence_classes(flags);
dfa.apply_equivalence_classes(eq);
if (flags & DFA_DUMP_EQUIV) {
cerr << "\nDFA equivalence class\n";
dump_equivalence_classes(cerr, eq);
}
} else if (flags & DFA_DUMP_EQUIV)
cerr << "\nDFA did not generate an equivalence class\n";
Add Differential State Compression to the DFA Differential state compression encodes a state's transitions as the difference between the state and its default state (the state it is relative too). This reduces the number of transitions that need to be stored in the transition table, hence reducing the size of the dfa. There is a trade off in that a single input character may have to traverse more than one state. This is somewhat offset by reduced table sizes providing better locality and caching properties. With carefully encoding we can still make constant match time guarentees. This patch guarentees that a state that is differentially encoded will do at most 3m state traversal to match an input of length m (as opposed to a non-differentially compressed dfa doing exactly m state traversals). In practice the actually number of extra traversals is less than this becaus we selectively choose which states are differentially encoded. In addition to reducing the size of the dfa by reducing the number of transitions that have to be stored. Differential encoding reduces the number of transitions that need to be considered by comb compression, which can result in tighter packing, due to a reduction in sparseness, and also reduces the time spent in comb compression which currently uses an O(n^2) algorithm. Differential encoding will always result in a DFA that is smaller or equal in size to the encoded DFA, and will usually improve compilation times, with the performance improvements increasing as the DFA gets larger. Eg. Given a example DFA that created 8991 states after minimization. * If only comb compression (current default) is used 52057 transitions are packed into a table of 69591 entries. Achieving an efficiency of about 75% (an average of about 7.74 table entries per state). With a resulting compressed dfa16 size of 404238 bytes and a run time for the dfa compilation of real 0m9.037s user 0m8.893s sys 0m0.036s * If differential encoding + comb compression is used, 8292 of the 8991 states are differentially encoded, with 31557 trans removed. Resulting in 20500 transitions are packed into a table of 20675 entries. Acheiving an efficiency of about 99.2% (an average of about 2.3 table entries per state With a resulting compressed dfa16 size of 207874 bytes (about 48.6% reduction) and a run time for the dfa compilation of real 0m5.416s (about 40% faster) user 0m5.280s sys 0m0.040s Repeating with a larger DFA that has 17033 states after minimization. * If only comb compression (current default) is used 102992 transitions are packed into a table of 137987 entries. Achieving an efficiency of about 75% (an average of about 8.10 entries per state). With a resultant compressed dfa16 size of 790410 bytes and a run time for d compilation of real 0m28.153s user 0m27.634s sys 0m0.120s * with differential encoding 39374 transition are packed into a table of 39594 entries. Achieving an efficiency of about 99.4% (an average of about 2.32 entries per state). With a resultant compressed dfa16 size of 396838 bytes (about 50% reduction and a run time for dfa compilation of real 0m11.804s (about 58% faster) user 0m11.657s sys 0m0.084s Signed-off-by: John Johansen <john.johansen@canonical.com> Acked-by: Seth Arnold <seth.arnold@canonical.com>
2014-01-09 16:55:55 -08:00
if (flags & DFA_CONTROL_DIFF_ENCODE) {
dfa.diff_encode(flags);
if (flags & DFA_DUMP_DIFF_ENCODE)
dfa.dump_diff_encode(cerr);
}
CHFA chfa(dfa, eq, flags);
if (flags & DFA_DUMP_TRANS_TABLE)
chfa.dump(cerr);
chfa.flex_table(stream, "");
}
catch(int error) {
*size = 0;
return NULL;
}
stringbuf *buf = stream.rdbuf();
buf->pubseekpos(0);
*size = buf->in_avail();
buffer = (char *)malloc(*size);
if (!buffer)
return NULL;
buf->sgetn(buffer, *size);
return buffer;
}