Switch to using JSON to store rulesets.

jsha · jsha · commit 31a0018859b4 · 2016-02-07T20:49:38.000-08:00
Loading the rulesets from a single JSON file is much faster, and because every ruleset is then held in memory, it avoids reads from disk after startup.

Previously, loading all targets from SQLite took about 268ms. Now it takes about
60ms, including reading the file from disk and parsing JSON.
diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
@@ -324,7 +324,7 @@ const RuleWriter = {
     // Add this ruleset id into HTTPSRules.targets if it's not already there.
     // This should only happen for custom user rules. Built-in rules get
     // their ids preloaded into the targets map, and have their <target>
-    // tags stripped when the sqlite database is built.
+    // tags stripped when the JSON database is built.
     var targets = xmlruleset.getElementsByTagName("target");
     for (var i = 0; i < targets.length; i++) {
       var host = targets[i].getAttribute("host");
@@ -392,29 +392,7 @@ const HTTPSRules = {
       var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
       this.scanRulefiles(rulefiles);
 
-      // Initialize database connection.
-      var dbFile = new FileUtils.File(RuleWriter.chromeToPath("chrome://https-everywhere/content/rulesets.sqlite"));
-      var rulesetDBConn = Services.storage.openDatabase(dbFile);
-      this.queryForRuleset = rulesetDBConn.createStatement(
-        "select contents from rulesets where id = :id");
-
-      // Preload the mapping of hostname target -> ruleset ID from DB.
-      // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
-      // but is faster than loading all of the rulesets. If this becomes a
-      // bottleneck, change it to load in a background webworker, or load
-      // a smaller bloom filter instead.
-      var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets");
-      this.log(DBUG, "Loading targets...");
-      while (targetsQuery.executeStep()) {
-        var host = targetsQuery.row.host;
-        var id = targetsQuery.row.ruleset_id;
-        if (!this.targets[host]) {
-          this.targets[host] = [id];
-        } else {
-          this.targets[host].push(id);
-        }
-      }
-      this.log(DBUG, "Loading adding targets.");
+      this.loadTargets();
     } catch(e) {
       this.log(DBUG,"Rules Failed: "+e);
     }
@@ -424,6 +402,13 @@ const HTTPSRules = {
     return;
   },
 
+  loadTargets: function() {
+    var file = new FileUtils.File(RuleWriter.chromeToPath("chrome://https-everywhere/content/rulesets.json"));
+    var rules = JSON.parse(RuleWriter.read(file));
+    this.targets = rules.targets;
+    this.rules_list = rules.rules_list;
+  },
+
   checkMixedContentHandling: function() {
     // Firefox 23+ blocks mixed content by default, so rulesets that create
     // mixed content situations should be disabled there
@@ -576,22 +561,8 @@ const HTTPSRules = {
   },
 
   // Load a ruleset by numeric id, e.g. 234
-  // NOTE: This call runs synchronously, which can lock up the browser UI. Is
-  // there any way to fix that, given that we need to run blocking in the request
-  // flow? Perhaps we can preload all targets from the DB into memory at startup
-  // so we only hit the DB when we know there is something to be had.
   loadRulesetById: function(ruleset_id) {
-    this.queryForRuleset.params.id = ruleset_id;
-
-    try {
-      if (this.queryForRuleset.executeStep()) {
-        RuleWriter.readFromString(this.queryForRuleset.row.contents, this, ruleset_id);
-      } else {
-        this.log(WARN,"Couldn't find ruleset for id " + ruleset_id);
-      }
-    } finally {
-      this.queryForRuleset.reset();
-    }
+    RuleWriter.readFromString(this.rules_list[ruleset_id], this, ruleset_id);
   },
 
   // Get all rulesets matching a given target, lazy-loading from DB as necessary.
diff --git a/src/components/https-everywhere.js b/src/components/https-everywhere.js
@@ -569,7 +569,7 @@ HTTPSEverywhere.prototype = {
                 break;
         }
     } else if (topic == "browser:purge-session-history") {
-      // The list of rulesets that have been loaded from the sqlite DB
+      // The list of rulesets that have been loaded from the JSON DB
       // constitutes a parallel history store, so we have to clear it.
       this.log(DBUG, "History cleared, reloading HTTPSRules to avoid information leak.");
       HTTPSRules.init();
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
@@ -4,13 +4,14 @@
 
 import glob
 import locale
+import json
 import os
 import re
 import sqlite3
 import subprocess
 import sys
 
-from collections import Counter
+import collections
 from lxml import etree
 
 # Explicitly set locale so sorting order for filenames is consistent.
@@ -19,6 +20,7 @@
 # It's also helpful to ensure consistency for the lowercase check below.
 locale.setlocale(locale.LC_ALL, 'C')
 
+json_path = os.path.join(os.path.dirname(__file__), '../pkg/rulesets.json')
 # Removing the file before we create it avoids some non-determinism.
 db_path = os.path.join(os.path.dirname(__file__), '../pkg/rulesets.unvalidated.sqlite')
 if os.path.isfile(db_path):
@@ -34,6 +36,11 @@
              (host TEXT,
               ruleset_id INTEGER)''')
 
+json_output = {
+    "rules_list": [],
+    "targets": collections.defaultdict(list)
+}
+
 parser = etree.XMLParser(remove_blank_text=True)
 
 # Precompile xpath expressions that get run repeatedly.
@@ -43,7 +50,7 @@
 # Sort filenames so output is deterministic.
 filenames = sorted(glob.glob('src/chrome/content/rules/*'))
 
-counted_lowercase_names = Counter([name.lower() for name in filenames])
+counted_lowercase_names = collections.Counter([name.lower() for name in filenames])
 most_common_entry = counted_lowercase_names.most_common(1)[0]
 if most_common_entry[1] > 1:
     dupe_filename = re.compile(re.escape(most_common_entry[0]), re.IGNORECASE)
@@ -80,6 +87,7 @@
     # targets are looked up in the target table, which has a foreign key
     # pointing into the ruleset table.
     etree.strip_tags(tree, 'target')
+    etree.strip_tags(tree, 'test')
 
     # Store the filename in the `f' attribute so "view source XML" for rules in
     # FF version can find it.
@@ -89,7 +97,14 @@
     ruleset_id = c.lastrowid
     for target in targets:
         c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''', (target, ruleset_id))
+        # id is the current length of the rules list - i.e. the offset at which
+        # this rule will be added in the list.
+        json_output["targets"][target].append(len(json_output["rules_list"]))
+    json_output["rules_list"].append(etree.tostring(tree))
 
 conn.commit()
 conn.execute("VACUUM")
 conn.close()
+
+with open(json_path, 'w') as f:
+    f.write(json.dumps(json_output))
diff --git a/utils/validate.sh b/utils/validate.sh
@@ -56,4 +56,4 @@ else
   die "Validation of rulesets against $GRAMMAR failed."
 fi
 
-cp "$INPUT" ../src/defaults/rulesets.sqlite
+cp "../pkg/rulesets.json" ../src/chrome/content/rulesets.json

Original file line number	Diff line number	Diff line change
`@@ -569,7 +569,7 @@ HTTPSEverywhere.prototype = {`
`569`	`569`	`break;`
`570`	`570`	`}`
`571`	`571`	`} else if (topic == "browser:purge-session-history") {`
`572`		`- // The list of rulesets that have been loaded from the sqlite DB`
	`572`	`+ // The list of rulesets that have been loaded from the JSON DB`
`573`	`573`	`// constitutes a parallel history store, so we have to clear it.`
`574`	`574`	`this.log(DBUG, "History cleared, reloading HTTPSRules to avoid information leak.");`
`575`	`575`	`HTTPSRules.init();`