Use an SQLite ruleset DB to speed Firefox startup.

jsha · jsha · commit 915f9149c735 · 2014-01-12T13:04:24.000-08:00
Note that this queries the DB synchronously on
many requests, potentially slowing down browsing.
Needs additional work.
diff --git a/makexpi.sh b/makexpi.sh
@@ -15,6 +15,7 @@ APP_NAME=https-everywhere
 #  ./makexpi.sh 0.2.3.development.2
 
 cd "`dirname $0`"
+RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite"
 
 [ -d pkg ] || mkdir pkg
 
@@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then
 fi
 # =============== END VALIDATION ================
 
+if [ "$1" != "--fast" -o ! -f "$RULESETS_SQLITE" ] ; then
+  echo "Generating sqlite DB"
+  ./utils/make-sqlite.py src/chrome/content/rules
+fi
+
 # The name/version of the XPI we're building comes from src/install.rdf
 XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/	/g' | cut -f3`"
 if [ "$1" ] && [ "$1" != "--fast" ] ; then
@@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
 	export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
 fi
 
-# Unless we're in a hurry and there's already a ruleset library, build it from
-# the ruleset .xml files
-
-if [ "$1" = "--fast" ] ; then
-  FAST="--fast"
-fi
-python ./utils/merge-rulesets.py $FAST
-
 cd src
 
 # Build the XPI!
@@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then
     rm -f "../$XPI_NAME"
     exit "$?"
 else
-  echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`"
+  echo >&2 "Total included rules: `sqlite3 $RULESETS_SQLITE 'select count(*) from rulesets'`"
   echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`"
   echo >&2 "Created $XPI_NAME"
   if [ -n "$BRANCH" ]; then
diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
@@ -280,6 +280,12 @@ const RuleWriter = {
 
     sstream.close();
     fstream.close();
+    return this.readFromString(data, rule_store, file);
+  },
+
+  readFromString: function(data, rule_store, file) {
+    if (typeof file === 'undefined') file = {path: 'fromString'};
+
     // XXX: With DOMParser, we probably do not need to throw away the XML
     // declaration anymore nowadays.
     data = data.replace(/<\?xml[^>]*\?>/, ""); 
@@ -410,32 +416,19 @@ const HTTPSRules = {
       this.rulesets = [];
       this.targets = {};  // dict mapping target host patterns -> lists of
                           // applicable rules
+      // dict listing target host patterns that don't exist in the DB
+      // (aka negative cache)
+      // TODO: Make this an LRU cache; clear it on history clear
+      this.nonTargets = {};
       this.rulesetsByID = {};
       this.rulesetsByName = {};
       var t1 = new Date().getTime();
       this.checkMixedContentHandling();
-      var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
-      this.scanRulefiles(rulefiles);
-      rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir());
-      this.scanRulefiles(rulefiles);
-      var t,i;
-      for (t in this.targets) {
-        for (i = 0 ; i < this.targets[t].length ; i++) {
-          this.log(INFO, t + " -> " + this.targets[t][i].name);
-        }
-      }
 
-      // for any rulesets with <target host="*">
-      // every URI needs to be checked against these rulesets
-      // (though currently we don't ship any)
-      this.global_rulesets = this.targets["*"] ? this.targets["*"] : [];
-
-      this.rulesets.sort(
-        function(r1,r2) {
-            if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1;
-            else return 1;
-        }
-      );
+      // Initialize database connection.
+      var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
+      var mDBConn = Services.storage.openDatabase(dbFile);
+      this.queryForTarget = mDBConn.createStatement("select id, contents from targets, rulesets where targets.ruleset_id = rulesets.id and host = :target;");
     } catch(e) {
       this.log(WARN,"Rules Failed: "+e);
     }
@@ -491,6 +484,8 @@ const HTTPSRules = {
     }
   },
 
+  httpMatch: /^http/i,
+
   rewrittenURI: function(alist, input_uri) {
     // This function oversees the task of working out if a uri should be
     // rewritten, what it should be rewritten to, and recordkeeping of which
@@ -511,7 +506,7 @@ const HTTPSRules = {
     try {
       var rs = this.potentiallyApplicableRulesets(uri.host);
     } catch(e) {
-      this.log(WARN, 'Could not check applicable rules for '+uri.spec);
+      this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e);
       return null;
     }
 
@@ -595,31 +590,66 @@ const HTTPSRules = {
         intoList.push(fromList[i]);
   },
 
+  // Look up a target host pattern (e.g. '*.openssl.org') in the SQLite
+  // ruleset DB. Returns the `contents` column of the first matching row, or
+  // null when the DB has no row for that target. Storage errors propagate to
+  // the caller.
+  // NOTE: This runs synchronously in the request flow, which can lock up the
+  // browser UI, so it should only be reached after the in-memory caches miss.
+  queryTarget: function(target) {
+    this.log(DBUG, "Querying DB for " + target);
+    var statement = this.queryForTarget.clone();
+    try {
+      statement.params.target = target;
+      return statement.executeStep() ? statement.row.contents : null;
+    } finally {
+      // The clone is never reused, so finalize it instead of only resetting
+      // it; otherwise one un-finalized statement is leaked per lookup.
+      statement.finalize();
+    }
+  },
+
   potentiallyApplicableRulesets: function(host) {
     // Return a list of rulesets that declare targets matching this host
     var i, tmp, t;
-    var results = this.global_rulesets.slice(0); // copy global_rulesets
-    try {
-      if (this.targets[host])
-        results = results.concat(this.targets[host]);
-    } catch(e) {   
-      this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e);
-      return [];
-    }
+    var results = [];
+
+    // Gather the rulesets for one target pattern into `results`, loading
+    // them from the DB on first use and recording misses in nonTargets.
+    var attempt = function(target) {
+      if (this.nonTargets[target]) {
+        return; // negative cache: already known to be absent from the DB
+      }
+      var cached = this.targets[target];
+      if (!cached || cached.length === 0) {
+        // Not in the positive cache yet, so fall back to the DB.
+        var ruleset = this.queryTarget(target);
+        if (ruleset == null) {
+          this.nonTargets[target] = 1;
+          return;
+        }
+        this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
+        RuleWriter.readFromString(ruleset, this);
+      }
+      this.setInsert(results, this.targets[target]);
+    }.bind(this);
+    // The exact host can be a target too, not only its wildcard variants.
+    attempt(host);
+
     // replace each portion of the domain with a * in turn
     var segmented = host.split(".");
     for (i = 0; i < segmented.length; ++i) {
       tmp = segmented[i];
       segmented[i] = "*";
       t = segmented.join(".");
       segmented[i] = tmp;
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
     // now eat away from the left, with *, so that for x.y.z.google.com we
     // check *.z.google.com and *.google.com (we did *.y.z.google.com above)
     for (i = 1; i <= segmented.length - 2; ++i) {
       t = "*." + segmented.slice(i,segmented.length).join(".");
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
     this.log(DBUG,"Potentially applicable rules for " + host + ":");
     for (i = 0; i < results.length; ++i)
diff --git a/src/components/https-everywhere.js b/src/components/https-everywhere.js
@@ -31,6 +31,9 @@ const Cc = Components.classes;
 const Cu = Components.utils;
 const Cr = Components.results;
 
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/FileUtils.jsm");
+
 const CP_SHOULDPROCESS = 4;
 
 const SERVICE_CTRID = "@eff.org/https-everywhere;1";
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python2.7
+#
+# Builds an sqlite DB containing all the rulesets, indexed by target.
+#
+# Writes src/defaults/rulesets.sqlite (resolved relative to this script) with
+# two tables: rulesets(id, name, contents) holds one row per ruleset document,
+# and targets(host, ruleset_id) holds one row per <target host="..."> of it.
+
+import sqlite3
+import argparse
+import sys, re, os
+
+from lxml import etree
+
+arg_parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description="Build an sqlite DB of rulesets, indexed by target host.")
+# nargs="*" produces a list, so the default must be a list too.
+arg_parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
+    default=["src/chrome/content/rules"],
+    help='Files or directories of ruleset XML to include.')
+
+args = arg_parser.parse_args()
+
+def nomes_all(where=sys.argv[1:]):
+    """Returns generator to extract all files from a list of files/dirs"""
+    if not where: where=['.']
+    for i in where:
+        if os.path.isfile(i):
+            yield i
+        elif os.path.isdir(i):
+            for r, d, f in os.walk(i):
+                for fi in f:
+                    yield os.path.join(r, fi)
+
+
+conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
+c = conn.cursor()
+# Start from empty tables so the DB mirrors the current XML files.
+c.execute('''DROP TABLE IF EXISTS rulesets''')
+c.execute('''CREATE TABLE rulesets
+             (id INTEGER PRIMARY KEY,
+              name TEXT,
+              contents TEXT)''')
+c.execute('''DROP TABLE IF EXISTS targets''')
+c.execute('''CREATE TABLE targets
+             (
+              host TEXT,
+              ruleset_id INTEGER)''')
+
+xml_parser = etree.XMLParser(remove_blank_text=True)
+
+for fi in nomes_all(args.ruleset):
+    try:
+        tree = etree.parse(fi, xml_parser)
+    except Exception as oops:
+        # Directory walks also yield non-XML files, so only report files that
+        # claim to be XML. Either way there is no usable tree: skip the file
+        # rather than falling through with a stale or undefined `tree`.
+        if fi[-4:] == ".xml":
+            print("%s failed XML validity: %s\n" % (fi, oops))
+        continue
+    if not tree.xpath("/ruleset"):
+        continue
+
+    # Remove comments to save space.
+    etree.strip_tags(tree,etree.Comment)
+
+    targets = tree.xpath("/ruleset/target/@host")
+    # TODO: Strip target tags too. Right now the JS code requires there be a
+    # target tag.
+    #etree.strip_tags(tree,'target')
+
+    ruleset_name = tree.xpath("/ruleset/@name")[0]
+    c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''', (ruleset_name, etree.tostring(tree)))
+    ruleset_id = c.lastrowid
+    for target in targets:
+        c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''', (target, ruleset_id))
+
+# The extension looks rulesets up by target host while handling requests, so
+# index that column instead of leaving every lookup to scan the whole table.
+c.execute('''CREATE INDEX targets_host ON targets (host)''')
+
+conn.commit()
+conn.close()