mirror of
https://github.com/outline/outline.git
synced 2026-06-13 03:14:59 +03:00
feat: Add popularity scoring (#10721)
* Simple first pass * Use findAllInBatches * Add comments,views,revisions * Add 'popular' tab to Home * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Add 'Popular' tab to collections * Boost search results based on popularityScore * Move to unlogged temp table --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -203,6 +203,9 @@ export default class Document extends ArchivableModel implements Searchable {
|
||||
@observable
|
||||
publishedAt: string | undefined;
|
||||
|
||||
@observable
|
||||
popularityScore: number;
|
||||
|
||||
/**
|
||||
* @deprecated Use path instead
|
||||
*/
|
||||
|
||||
@@ -57,6 +57,7 @@ const ShareButton = lazyWithRetry(() => import("./components/ShareButton"));
|
||||
enum CollectionPath {
|
||||
Overview = "overview",
|
||||
Recent = "recent",
|
||||
Popular = "popular",
|
||||
Updated = "updated",
|
||||
Published = "published",
|
||||
Old = "old",
|
||||
@@ -242,6 +243,9 @@ const CollectionScene = observer(function _CollectionScene() {
|
||||
<Tab {...tabProps(CollectionPath.Recent)}>{t("Documents")}</Tab>
|
||||
{!collection.isArchived && (
|
||||
<>
|
||||
<Tab {...tabProps(CollectionPath.Popular)}>
|
||||
{t("Popular")}
|
||||
</Tab>
|
||||
<Tab {...tabProps(CollectionPath.Updated)}>
|
||||
{t("Recently updated")}
|
||||
</Tab>
|
||||
@@ -353,6 +357,21 @@ const CollectionScene = observer(function _CollectionScene() {
|
||||
}}
|
||||
/>
|
||||
</Route>
|
||||
<Route
|
||||
path={collectionPath(
|
||||
collection.path,
|
||||
CollectionPath.Popular
|
||||
)}
|
||||
>
|
||||
<PaginatedDocumentList
|
||||
key="popular"
|
||||
documents={documents.popularInCollection(collection.id)}
|
||||
fetch={documents.fetchPopular}
|
||||
options={{
|
||||
collectionId: collection.id,
|
||||
}}
|
||||
/>
|
||||
</Route>
|
||||
<Route
|
||||
path={collectionPath(
|
||||
collection.path,
|
||||
|
||||
+17
-1
@@ -58,6 +58,9 @@ function Home() {
|
||||
<Tab to="/home" exact>
|
||||
{t("Recently viewed")}
|
||||
</Tab>
|
||||
<Tab to="/home/popular" exact>
|
||||
{t("Popular")}
|
||||
</Tab>
|
||||
<Tab to="/home/recent" exact>
|
||||
{t("Recently updated")}
|
||||
</Tab>
|
||||
@@ -68,7 +71,20 @@ function Home() {
|
||||
<PaginatedDocumentList
|
||||
documents={documents.recentlyUpdated}
|
||||
fetch={documents.fetchRecentlyUpdated}
|
||||
empty={<Empty>{t("Weird, this shouldn’t ever be empty")}</Empty>}
|
||||
empty={<Empty>{t("Weird, this shouldn't ever be empty")}</Empty>}
|
||||
showCollection
|
||||
/>
|
||||
</Route>
|
||||
<Route path="/home/popular">
|
||||
<PaginatedDocumentList
|
||||
key="popular"
|
||||
documents={documents.popular}
|
||||
fetch={documents.fetchPopular}
|
||||
empty={
|
||||
<Empty>
|
||||
{t("Documents with recent activity will appear here")}
|
||||
</Empty>
|
||||
}
|
||||
showCollection
|
||||
/>
|
||||
</Route>
|
||||
|
||||
@@ -96,6 +96,11 @@ export default class DocumentsStore extends Store<Document> {
|
||||
return orderBy(this.all, "updatedAt", "desc");
|
||||
}
|
||||
|
||||
/**
 * All documents currently held in the store, ordered by `popularityScore`
 * from highest to lowest.
 */
@computed
get popular(): Document[] {
  const everyDocument = this.all;
  return orderBy(everyDocument, "popularityScore", "desc");
}
|
||||
|
||||
@computed
|
||||
get templates(): Document[] {
|
||||
return orderBy(
|
||||
@@ -208,6 +213,10 @@ export default class DocumentsStore extends Store<Document> {
|
||||
return naturalSort(this.inCollection(collectionId), "title");
|
||||
}
|
||||
|
||||
/**
 * Returns the documents that `inCollection` yields for the given collection,
 * ordered by `popularityScore` from highest to lowest.
 *
 * @param collectionId The id of the collection to list documents for.
 * @returns The collection's documents, most popular first.
 */
popularInCollection(collectionId: string): Document[] {
  const collectionDocuments = this.inCollection(collectionId);
  return orderBy(collectionDocuments, "popularityScore", "desc");
}
|
||||
|
||||
get(id: string): Document | undefined {
|
||||
return id
|
||||
? (this.data.get(id) ??
|
||||
@@ -386,6 +395,14 @@ export default class DocumentsStore extends Store<Document> {
|
||||
options?: PaginationParams
|
||||
): Promise<Document[]> => this.fetchNamedPage("viewed", options);
|
||||
|
||||
/**
 * Fetches a page of documents through the "list" named page, requesting
 * `popularityScore` descending by default. Caller-supplied options are spread
 * last, so they take precedence over the default sort and direction.
 *
 * @param options Optional pagination parameters merged into the request.
 * @returns A promise resolving to the fetched documents.
 */
@action
fetchPopular = async (options?: PaginationParams): Promise<Document[]> => {
  return this.fetchNamedPage("list", {
    sort: "popularityScore",
    direction: "DESC",
    ...options,
  });
};
|
||||
|
||||
@action
|
||||
fetchStarred = (options?: PaginationParams): Promise<Document[]> =>
|
||||
this.fetchNamedPage("starred", options);
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
"use strict";
|
||||
|
||||
/** @type {import('sequelize-cli').Migration} */
|
||||
module.exports = {
|
||||
async up(queryInterface, Sequelize) {
|
||||
await queryInterface.addColumn("documents", "popularityScore", {
|
||||
type: Sequelize.FLOAT,
|
||||
allowNull: false,
|
||||
defaultValue: 0,
|
||||
});
|
||||
},
|
||||
|
||||
async down(queryInterface, Sequelize) {
|
||||
await queryInterface.removeColumn("documents", "popularityScore");
|
||||
},
|
||||
};
|
||||
@@ -40,6 +40,7 @@ import {
|
||||
BelongsToMany,
|
||||
Unique,
|
||||
AfterUpdate,
|
||||
IsFloat,
|
||||
} from "sequelize-typescript";
|
||||
import { MaxLength } from "class-validator";
|
||||
import isUUID from "validator/lib/isUUID";
|
||||
@@ -382,6 +383,13 @@ class Document extends ArchivableModel<
|
||||
@Column(DataType.INTEGER)
|
||||
revisionCount: number;
|
||||
|
||||
/** A score representing the popularity of this document based on views and engagement. */
|
||||
@IsFloat
|
||||
@Default(0)
|
||||
@Column(DataType.FLOAT)
|
||||
@SkipChangeset
|
||||
popularityScore: number;
|
||||
|
||||
/** Whether the document is published, and if so when. */
|
||||
@IsDate
|
||||
@Column
|
||||
|
||||
@@ -483,9 +483,11 @@ export default class SearchHelper {
|
||||
const order: Order = [["updatedAt", "DESC"]];
|
||||
|
||||
if (query) {
|
||||
// Combine text relevance with logarithmic popularity boost
|
||||
// Popular documents get a boost, but text relevance remains primary
|
||||
attributes.push([
|
||||
Sequelize.literal(
|
||||
`ts_rank("searchVector", to_tsquery('english', :query))`
|
||||
`ts_rank("searchVector", to_tsquery('english', :query)) * (1 + LN(1 + COALESCE("popularityScore", 0)))`
|
||||
),
|
||||
"searchRanking",
|
||||
]);
|
||||
|
||||
@@ -94,6 +94,7 @@ async function presentDocument(
|
||||
res.templateId = document.templateId;
|
||||
res.template = document.template;
|
||||
res.insightsEnabled = document.insightsEnabled;
|
||||
res.popularityScore = document.popularityScore;
|
||||
res.sourceMetadata = document.sourceMetadata
|
||||
? {
|
||||
importedAt: source?.createdAt ?? document.createdAt,
|
||||
|
||||
@@ -0,0 +1,370 @@
|
||||
import crypto from "crypto";
|
||||
import { subWeeks } from "date-fns";
|
||||
import { QueryTypes } from "sequelize";
|
||||
import Logger from "@server/logging/Logger";
|
||||
import BaseTask, { TaskSchedule } from "./BaseTask";
|
||||
import { sequelize } from "@server/storage/database";
|
||||
|
||||
type Props = Record<string, never>;
|
||||
|
||||
/**
|
||||
* Gravity constant for time decay. Higher values cause faster decay of older content.
|
||||
* With `GRAVITY = 0.7` and `TIME_OFFSET_HOURS = 2`, relative to activity that just happened:
|
||||
* - Activity from **1 day ago** retains ~17% of its score
|
||||
* - Activity from **3 days ago** retains ~8% of its score
|
||||
* - Activity from **1 week ago** retains ~4% of its score
|
||||
* - Activity from **2 weeks ago** retains ~3% of its score
|
||||
*/
|
||||
const GRAVITY = 0.7;
|
||||
|
||||
/**
|
||||
* Number of hours to add to age to smooth the decay curve,
|
||||
* preventing brand new content from having disproportionately
|
||||
* high scores compared to content just a few hours old.
|
||||
*/
|
||||
const TIME_OFFSET_HOURS = 2;
|
||||
|
||||
/**
|
||||
* Weight multipliers for different activity types relative to base score
|
||||
*/
|
||||
const ACTIVITY_WEIGHTS = {
|
||||
revision: 1.0,
|
||||
comment: 1.2,
|
||||
view: 0.5,
|
||||
};
|
||||
|
||||
/**
|
||||
* Only recalculate scores for activity within this period.
|
||||
*/
|
||||
const ACTIVITY_THRESHOLD_WEEKS = 2;
|
||||
|
||||
/**
|
||||
* Batch size for processing updates - each batch is an independent transaction
|
||||
*/
|
||||
const BATCH_SIZE = 500;
|
||||
|
||||
/**
|
||||
* Maximum retries for failed batch operations
|
||||
*/
|
||||
const MAX_RETRIES = 3;
|
||||
|
||||
/**
|
||||
* Delay between retries in milliseconds
|
||||
*/
|
||||
const RETRY_DELAY_MS = 1000;
|
||||
|
||||
/**
|
||||
* Base name for the working table used to track documents to process
|
||||
*/
|
||||
const WORKING_TABLE_PREFIX = "popularity_score_working";
|
||||
|
||||
/**
 * Task that recalculates the `popularityScore` column of documents from
 * revisions, comments and views newer than ACTIVITY_THRESHOLD_WEEKS. Each
 * activity contributes `weight / (ageInHours + TIME_OFFSET_HOURS) ^ GRAVITY`.
 *
 * Candidate document ids are staged in a per-run unlogged working table and
 * processed in batches of BATCH_SIZE, each batch in its own transaction, so a
 * failing batch does not roll back batches that already committed.
 */
export default class UpdateDocumentsPopularityScoreTask extends BaseTask<Props> {
  /**
   * Unique table name for this task run to prevent conflicts with concurrent runs
   */
  private workingTable: string = "";

  static cron = TaskSchedule.Day;

  /**
   * Entry point: stages the documents to rescore, processes them batch by
   * batch and always drops the working table afterwards.
   *
   * @throws Rethrows any error raised outside of an individual batch (for
   * example while creating the working table) after logging it.
   */
  public async perform() {
    Logger.info("task", "Updating document popularity scores…");

    const now = new Date();
    const activityThreshold = subWeeks(now, ACTIVITY_THRESHOLD_WEEKS);

    // Generate unique table name for this run to prevent conflicts
    const uniqueId = crypto.randomBytes(8).toString("hex");
    this.workingTable = `${WORKING_TABLE_PREFIX}_${uniqueId}`;

    try {
      // Setup: Create working table and populate with the document IDs to rescore
      await this.setupWorkingTable(activityThreshold);

      const activeCount = await this.getWorkingTableCount();

      if (activeCount === 0) {
        Logger.info("task", "No documents with recent activity found");
        return;
      }

      Logger.info(
        "task",
        `Found ${activeCount} documents with recent activity or a stale score`
      );

      // Process documents in independent batches
      let totalUpdated = 0;
      let totalErrors = 0;
      let batchNumber = 0;

      while (true) {
        const remaining = await this.getWorkingTableCount();
        if (remaining === 0) {
          break;
        }

        batchNumber++;

        try {
          const updated = await this.processBatchWithRetry(
            activityThreshold,
            now
          );
          totalUpdated += updated;

          Logger.debug(
            "task",
            `Batch ${batchNumber}: updated ${updated} documents, ${remaining - updated} remaining`
          );
        } catch (error) {
          totalErrors++;
          Logger.error(`Batch ${batchNumber} failed after retries`, error);

          // Remove failed batch from working table to prevent infinite loop
          await this.skipCurrentBatch();
        }
      }

      Logger.info(
        "task",
        `Completed updating popularity scores: ${totalUpdated} documents updated, ${totalErrors} batch errors`
      );
    } catch (error) {
      Logger.error("Failed to update document popularity scores", error);
      throw error;
    } finally {
      // Always clean up the working table
      await this.cleanupWorkingTable();
    }
  }

  /**
   * Creates an unlogged working table and populates it with the IDs of
   * published, non-deleted documents that need rescoring: those with a
   * revision, comment or view inside the activity window, plus those that
   * still carry a non-zero score. Unlogged tables are faster because they
   * skip WAL logging, and data loss on crash is acceptable here.
   *
   * @param activityThreshold Activity older than this date is ignored.
   */
  private async setupWorkingTable(activityThreshold: Date): Promise<void> {
    // Drop any existing table first to avoid type conflicts from previous crashed runs
    await sequelize.query(`DROP TABLE IF EXISTS ${this.workingTable} CASCADE`);

    // Create unlogged table - faster than regular tables as it skips WAL logging
    await sequelize.query(`
      CREATE UNLOGGED TABLE ${this.workingTable} (
        "documentId" UUID PRIMARY KEY,
        processed BOOLEAN DEFAULT FALSE
      )
    `);

    // Populate with documents that are valid (published, not deleted) and
    // either have recent activity or still hold a non-zero score. The second
    // group matters: once a document's activity ages out of the window it
    // would otherwise never be rescored and would keep its last score forever.
    // Including it lets the batch update decay that score to 0.
    await sequelize.query(
      `
      INSERT INTO ${this.workingTable} ("documentId")
      SELECT DISTINCT d.id
      FROM documents d
      WHERE d."publishedAt" IS NOT NULL
        AND d."deletedAt" IS NULL
        AND (
          d."popularityScore" > 0
          OR EXISTS (
            SELECT 1 FROM revisions r
            WHERE r."documentId" = d.id AND r."createdAt" >= :threshold
          )
          OR EXISTS (
            SELECT 1 FROM comments c
            WHERE c."documentId" = d.id AND c."createdAt" >= :threshold
          )
          OR EXISTS (
            SELECT 1 FROM views v
            WHERE v."documentId" = d.id AND v."updatedAt" >= :threshold
          )
        )
      `,
      { replacements: { threshold: activityThreshold } }
    );

    // Create index on processed column for efficient batch selection
    await sequelize.query(`
      CREATE INDEX ON ${this.workingTable} (processed) WHERE NOT processed
    `);
  }

  /**
   * Returns count of unprocessed documents in working table
   */
  private async getWorkingTableCount(): Promise<number> {
    const [result] = await sequelize.query<{ count: string }>(
      `SELECT COUNT(*) as count FROM ${this.workingTable} WHERE NOT processed`,
      { type: QueryTypes.SELECT }
    );
    // The count column is typed as a string here, so convert it to a number.
    return parseInt(result.count, 10);
  }

  /**
   * Processes a batch of documents with retry logic.
   * Each batch is an independent transaction that commits on success.
   *
   * @param activityThreshold Activity older than this date is ignored.
   * @param now Reference time used to compute the age of each activity.
   * @param attempt 1-based attempt counter, incremented on each retry.
   * @returns The number of documents in the processed batch (0 if none left).
   * @throws The last error once MAX_RETRIES attempts have failed.
   */
  private async processBatchWithRetry(
    activityThreshold: Date,
    now: Date,
    attempt = 1
  ): Promise<number> {
    try {
      return await sequelize.transaction(async (transaction) => {
        // Select and lock a batch of unprocessed documents
        const batch = await sequelize.query<{ documentId: string }>(
          `
          SELECT "documentId" FROM ${this.workingTable}
          WHERE NOT processed
          ORDER BY "documentId"
          LIMIT :limit
          FOR UPDATE SKIP LOCKED
          `,
          {
            replacements: { limit: BATCH_SIZE },
            type: QueryTypes.SELECT,
            transaction,
          }
        );

        if (batch.length === 0) {
          return 0;
        }

        const documentIds = batch.map((b) => b.documentId);

        // Build VALUES clause for the batch, sequelize did not like array parameters in casted in clause.
        // The ids are read back from the UUID-typed working table column, not from user input.
        const valuesClause = documentIds
          .map((id) => `('${id}'::uuid)`)
          .join(", ");

        // Calculate and update scores using JOINs (no IN clause with large arrays).
        // Documents in the batch with no activity in the window get a total of 0.
        await sequelize.query(
          `
          WITH batch_docs AS (
            SELECT * FROM (VALUES ${valuesClause}) AS t(id)
          ),
          revision_scores AS (
            SELECT
              r."documentId",
              SUM(:revisionWeight / POWER(
                GREATEST(EXTRACT(EPOCH FROM (:now::timestamp - r."createdAt")) / 3600 + :timeOffset, 0.1),
                :gravity
              )) as score
            FROM revisions r
            INNER JOIN batch_docs bd ON r."documentId" = bd.id
            WHERE r."createdAt" >= :threshold
            GROUP BY r."documentId"
          ),
          comment_scores AS (
            SELECT
              c."documentId",
              SUM(:commentWeight / POWER(
                GREATEST(EXTRACT(EPOCH FROM (:now::timestamp - c."createdAt")) / 3600 + :timeOffset, 0.1),
                :gravity
              )) as score
            FROM comments c
            INNER JOIN batch_docs bd ON c."documentId" = bd.id
            WHERE c."createdAt" >= :threshold
            GROUP BY c."documentId"
          ),
          view_scores AS (
            SELECT
              v."documentId",
              SUM(:viewWeight / POWER(
                GREATEST(EXTRACT(EPOCH FROM (:now::timestamp - v."updatedAt")) / 3600 + :timeOffset, 0.1),
                :gravity
              )) as score
            FROM views v
            INNER JOIN batch_docs bd ON v."documentId" = bd.id
            WHERE v."updatedAt" >= :threshold
            GROUP BY v."documentId"
          ),
          combined_scores AS (
            SELECT
              bd.id as "documentId",
              COALESCE(rs.score, 0) + COALESCE(cs.score, 0) + COALESCE(vs.score, 0) as total_score
            FROM batch_docs bd
            LEFT JOIN revision_scores rs ON bd.id = rs."documentId"
            LEFT JOIN comment_scores cs ON bd.id = cs."documentId"
            LEFT JOIN view_scores vs ON bd.id = vs."documentId"
          )
          UPDATE documents
          SET "popularityScore" = combined_scores.total_score
          FROM combined_scores
          WHERE documents.id = combined_scores."documentId"
          `,
          {
            replacements: {
              threshold: activityThreshold,
              now,
              gravity: GRAVITY,
              timeOffset: TIME_OFFSET_HOURS,
              revisionWeight: ACTIVITY_WEIGHTS.revision,
              commentWeight: ACTIVITY_WEIGHTS.comment,
              viewWeight: ACTIVITY_WEIGHTS.view,
            },
            transaction,
          }
        );

        // Mark batch as processed
        await sequelize.query(
          `
          UPDATE ${this.workingTable}
          SET processed = TRUE
          WHERE "documentId" IN (SELECT id FROM (VALUES ${valuesClause}) AS t(id))
          `,
          { transaction }
        );

        return documentIds.length;
      });
    } catch (error) {
      if (attempt < MAX_RETRIES) {
        Logger.warn(
          `Batch update failed, retrying (attempt ${attempt + 1}/${MAX_RETRIES})`,
          { error }
        );
        // Back off a little longer on each successive attempt.
        await this.sleep(RETRY_DELAY_MS * attempt);
        return this.processBatchWithRetry(activityThreshold, now, attempt + 1);
      }
      throw error;
    }
  }

  /**
   * Marks current batch as processed without updating scores.
   * Used when a batch fails repeatedly to prevent infinite loops.
   * Selects rows with the same ordering and limit as the batch query, so it
   * targets the batch that just failed.
   */
  private async skipCurrentBatch(): Promise<void> {
    await sequelize.query(
      `
      UPDATE ${this.workingTable}
      SET processed = TRUE
      WHERE "documentId" IN (
        SELECT "documentId" FROM ${this.workingTable}
        WHERE NOT processed
        ORDER BY "documentId"
        LIMIT :limit
      )
      `,
      { replacements: { limit: BATCH_SIZE } }
    );
  }

  /**
   * Removes the working table. Failures are logged as warnings rather than
   * thrown, so cleanup never masks the outcome of the task itself.
   */
  private async cleanupWorkingTable(): Promise<void> {
    try {
      await sequelize.query(
        `DROP TABLE IF EXISTS ${this.workingTable} CASCADE`
      );
    } catch (error) {
      Logger.warn("Failed to clean up working table", { error });
    }
  }

  /**
   * Resolves after the given number of milliseconds.
   *
   * @param ms Delay in milliseconds.
   */
  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
|
||||
@@ -13,7 +13,14 @@ const DocumentsSortParamsSchema = z.object({
|
||||
sort: z
|
||||
.string()
|
||||
.refine((val) =>
|
||||
["createdAt", "updatedAt", "publishedAt", "index", "title"].includes(val)
|
||||
[
|
||||
"createdAt",
|
||||
"updatedAt",
|
||||
"publishedAt",
|
||||
"index",
|
||||
"title",
|
||||
"popularityScore",
|
||||
].includes(val)
|
||||
)
|
||||
.default("updatedAt"),
|
||||
|
||||
|
||||
@@ -666,6 +666,7 @@
|
||||
"Sorry, an error occurred saving the collection": "Sorry, an error occurred saving the collection",
|
||||
"Add a description": "Add a description",
|
||||
"Overview": "Overview",
|
||||
"Popular": "Popular",
|
||||
"Recently updated": "Recently updated",
|
||||
"Recently published": "Recently published",
|
||||
"Least recently updated": "Least recently updated",
|
||||
@@ -783,7 +784,8 @@
|
||||
"Something went wrong": "Something went wrong",
|
||||
"Sorry, an unknown error occurred loading the page. Please try again or contact support if the issue persists.": "Sorry, an unknown error occurred loading the page. Please try again or contact support if the issue persists.",
|
||||
"Created by me": "Created by me",
|
||||
"Weird, this shouldn’t ever be empty": "Weird, this shouldn’t ever be empty",
|
||||
"Weird, this shouldn't ever be empty": "Weird, this shouldn't ever be empty",
|
||||
"Documents with recent activity will appear here": "Documents with recent activity will appear here",
|
||||
"You haven’t created any documents yet": "You haven’t created any documents yet",
|
||||
"Documents you’ve recently viewed will be here for easy access": "Documents you’ve recently viewed will be here for easy access",
|
||||
"{{ count }} invites sent": "{{ count }} invites sent",
|
||||
|
||||
Reference in New Issue
Block a user