Optimize the reverse dependencies endpoint

sgrif · sgrif · commit 1122ab6afa76 · 2017-03-12T13:11:35.000-04:00
Right now on production the reverse_dependencies endpoint takes around a
second to load. This commit decreases that to ~70ms on my machine.
I've moved the query to its own file to give syntax highlighting and add
comments. The query barely differs from the original though. I've made
two main changes.

The first is using a window function to get the total count instead of a
second query. The second is the addition of `where crate_id =
any(subselect)` when gathering the versions.

Ultimately this query is an intersection between all the max versions,
and the versions which depend on this crate. Prior to this commit, the
query essentially did exactly that. The problem is that even if an index
is used, collecting all the max crates takes ~300ms. This speeds that up
considerably by first scoping the versions to versions of crates which
have ever depended on the target, then collecting the max versions, then
filtering it to versions which depend on the target.

The query looks considerably more complex (and it is), but it's the
difference between the query taking 70ms and the query taking 300ms.

One further alternative would be to have a materialized view for the max
versions, which gets the query time down to ~10ms. However, I don't
think the complexity of adding (and refreshing) the view is worth it to
simplify one query.
diff --git a/migrations/20170312165531_index_dependencies/down.sql b/migrations/20170312165531_index_dependencies/down.sql
@@ -0,0 +1 @@
+DROP INDEX dependencies_crate_id_version_id_idx;
diff --git a/migrations/20170312165531_index_dependencies/up.sql b/migrations/20170312165531_index_dependencies/up.sql
@@ -0,0 +1 @@
+CREATE INDEX ON dependencies (crate_id, version_id);
diff --git a/src/krate.rs b/src/krate.rs
@@ -576,38 +576,18 @@ impl Crate {
                                 offset: i64,
                                 limit: i64)
                                 -> CargoResult<(Vec<(Dependency, String, i32)>, i64)> {
-        let select_sql = "
-              FROM dependencies
-              INNER JOIN (
-                SELECT versions.*,
-                    row_number() OVER (PARTITION BY crate_id ORDER BY to_semver_no_prerelease(num) DESC NULLS LAST) rn
-                    FROM versions
-                    WHERE NOT yanked
-              ) versions
-                ON versions.id = dependencies.version_id
-              INNER JOIN crates
-                ON crates.id = versions.crate_id
-              WHERE dependencies.crate_id = $1
-                AND rn = 1
-        ";
-        let fetch_sql = format!("SELECT DISTINCT ON (crate_downloads, crate_name)
-                                        dependencies.*,
-                                        crates.downloads AS crate_downloads,
-                                        crates.name AS crate_name
-                                        {}
-                               ORDER BY crate_downloads DESC
-                                 OFFSET $2
-                                  LIMIT $3",
-                                select_sql);
-        let count_sql = format!("SELECT COUNT(DISTINCT(crates.id)) {}", select_sql);
-
-        let stmt = conn.prepare(&fetch_sql)?;
-        let vec: Vec<_> = stmt.query(&[&self.id, &offset, &limit])?
+        let stmt = conn.prepare(include_str!("krate_reverse_dependencies.sql"))?;
+
+        let rows = stmt.query(&[&self.id, &offset, &limit])?;
+        let cnt = if rows.is_empty() {
+            0i64
+        } else {
+            rows.get(0).get("total")
+        };
+        let vec: Vec<_> = rows
             .iter()
             .map(|r| (Model::from_row(&r), r.get("crate_name"), r.get("crate_downloads")))
             .collect();
-        let stmt = conn.prepare(&count_sql)?;
-        let cnt: i64 = stmt.query(&[&self.id])?.iter().next().unwrap().get(0);
 
         Ok((vec, cnt))
     }
diff --git a/src/krate_reverse_dependencies.sql b/src/krate_reverse_dependencies.sql
@@ -0,0 +1,37 @@
+-- Apply pagination to the whole thing
+SELECT *, COUNT(*) OVER () as total FROM (
+    -- Multple dependencies can exist, make it distinct
+    SELECT DISTINCT ON (crate_downloads, crate_name)
+    dependencies.*,
+    crates.downloads AS crate_downloads,
+    crates.name AS crate_name
+    FROM dependencies
+    -- We only want the crates whose *max* version is dependent, so we join on a
+    -- subselect that includes the versions with their ordinal position
+    INNER JOIN (
+        SELECT versions.*,
+        row_number() OVER (
+            PARTITION BY crate_id
+            ORDER BY to_semver_no_prerelease(num) DESC NULLS LAST
+        ) rn
+        FROM versions
+        WHERE NOT yanked
+        -- This is completely redundant, but it's faster to filter the versions
+        -- early even if this subselect is done via an index scan.
+        AND crate_id = ANY(
+            SELECT versions.crate_id
+            FROM versions
+            INNER JOIN dependencies
+            ON dependencies.version_id = versions.id
+            WHERE dependencies.crate_id = $1
+        )
+    ) versions
+      ON versions.id = dependencies.version_id
+    INNER JOIN crates
+      ON crates.id = versions.crate_id
+    WHERE dependencies.crate_id = $1
+      AND rn = 1
+    ORDER BY crate_downloads DESC
+) t
+OFFSET $2
+LIMIT $3

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+DROP INDEX dependencies_crate_id_version_id_idx;`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+CREATE INDEX ON dependencies (crate_id, version_id);`