From 9a60adc84d7a0a546d4b093d0f1db99ef45ce137 Mon Sep 17 00:00:00 2001 From: Alex Patterson Date: Thu, 5 Mar 2026 09:16:50 -0500 Subject: [PATCH] feat: add topic dedup check to ingest cron - Add isTopicAlreadyCovered() with GROQ title match + topic overlap - Add isSlugTaken() for slug collision detection - Walk ranked trends list, skip already-covered topics - Add dedupWindowDays field to contentConfig (default 90, 0 to disable) - Pass single selectedTrend to buildPrompt and createSanityDocuments - Graceful degradation: dedup failures don't block pipeline --- app/api/cron/ingest/route.ts | 200 +++++++++++++++++++-- sanity/schemas/singletons/contentConfig.ts | 8 + 2 files changed, 197 insertions(+), 11 deletions(-) diff --git a/app/api/cron/ingest/route.ts b/app/api/cron/ingest/route.ts index fc456f02..2ec77541 100644 --- a/app/api/cron/ingest/route.ts +++ b/app/api/cron/ingest/route.ts @@ -108,6 +108,140 @@ const FALLBACK_TRENDS: TrendResult[] = [ }, ]; +// --------------------------------------------------------------------------- +// Common stop words stripped when extracting search terms for dedup +// --------------------------------------------------------------------------- + +const STOP_WORDS = new Set([ + "the", "a", "an", "is", "of", "in", "for", "and", "to", "how", "why", + "what", "with", "new", "your", "are", "was", "be", "been", "being", + "have", "has", "had", "do", "does", "did", "will", "would", "could", + "should", "may", "might", "must", "shall", "can", "need", "dare", + "ought", "used", "every", "all", "both", "few", "more", "most", + "other", "some", "such", "no", "nor", "not", "only", "own", "same", + "so", "than", "too", "very", "just", "because", "as", "until", + "while", "about", "between", "through", "during", "before", "after", + "above", "below", "from", "up", "down", "out", "on", "off", "over", + "under", "again", "further", "then", "once", "that", "this", "these", + "those", "it", "its", "we", "you", "they", "them", "their", "our", + "my", "he", "she", "him", "her", "his", "who", "which", "when", + "where", "there", "here", +]); + +// --------------------------------------------------------------------------- +// Topic Dedup — check if a topic has already been covered recently +// --------------------------------------------------------------------------- + +/** + * Extract 2-3 meaningful search terms from a topic title. + * Strips stop words and short tokens, returns lowercase terms. + */ +function extractSearchTerms(title: string): string[] { + const words = title + .toLowerCase() + .replace(/[^a-z0-9\s.-]/g, " ") + .split(/\s+/) + .filter((w) => w.length > 2 && !STOP_WORDS.has(w)); + + // Return up to 3 most meaningful terms (first terms tend to be most specific) + return words.slice(0, 3); +} + +/** + * Check whether a topic (by title + slug) has already been covered within + * the configured dedup window. Queries both `contentIdea` and `automatedVideo` + * documents in Sanity. + * + * Returns `true` if the topic should be skipped (already covered). + */ +async function isTopicAlreadyCovered(topic: string, topics: string[]): Promise { + const dedupWindowDays = await getConfigValue("content_config", "dedupWindowDays", 90); + + // Dedup disabled when window is 0 + if (dedupWindowDays <= 0) { + return false; + } + + const cutoff = new Date(); + cutoff.setDate(cutoff.getDate() - dedupWindowDays); + const cutoffISO = cutoff.toISOString(); + + const searchTerms = extractSearchTerms(topic); + if (searchTerms.length === 0) { + return false; + } + + // Build a GROQ match pattern — each term becomes a wildcard prefix match + // GROQ `match` supports patterns like "react*" and works with || + const matchPatterns = searchTerms.map((t) => `${t}*`); + + // Query 1: Title-based match on contentIdea and automatedVideo + // The `match` operator in GROQ does case-insensitive prefix matching + const titleQuery = `{ + "ideas": *[_type == "contentIdea" && _createdAt > $cutoff && title match $patterns] { _id, title, topics }, + "videos": *[_type == "automatedVideo" && _createdAt > $cutoff && title match $patterns] { _id, title } + }`; + + try { + const titleResults = await writeClient.fetch(titleQuery, { + cutoff: cutoffISO, + patterns: matchPatterns, + }); + + const ideaMatches: Array<{ _id: string; title: string; topics?: string[] }> = titleResults.ideas ?? []; + const videoMatches: Array<{ _id: string; title: string }> = titleResults.videos ?? []; + + // If any title match is found, topic is covered + if (ideaMatches.length > 0 || videoMatches.length > 0) { + console.log( + `[CRON/ingest] Dedup: title match found for "${topic}" — ${ideaMatches.length} ideas, ${videoMatches.length} videos`, + ); + return true; + } + + // Query 2: Check topic tag overlap on contentIdea documents + // We consider a topic covered if 2+ tags overlap + if (topics.length > 0) { + const topicLower = topics.map((t) => t.toLowerCase()); + const overlapQuery = `*[_type == "contentIdea" && _createdAt > $cutoff && count((topics[])[@ in $topicTags]) >= 2] { _id, title, topics }`; + + const overlapResults = await writeClient.fetch(overlapQuery, { + cutoff: cutoffISO, + topicTags: topicLower, + }); + + if (overlapResults.length > 0) { + console.log( + `[CRON/ingest] Dedup: topic overlap found for "${topic}" — ${overlapResults.length} matching ideas`, + ); + return true; + } + } + + return false; + } catch (err) { + // If dedup query fails, don't block the pipeline — log and continue + console.warn("[CRON/ingest] Dedup query failed, allowing topic:", err); + return false; + } +} + +/** + * Check if a slug already exists on an automatedVideo document. + */ +async function isSlugTaken(slug: string): Promise { + try { + const results = await writeClient.fetch( + `*[_type == "automatedVideo" && slug.current == $slug][0..0] { _id }`, + { slug }, + ); + return results.length > 0; + } catch (err) { + console.warn("[CRON/ingest] Slug check failed, allowing:", err); + return false; + } +} + // --------------------------------------------------------------------------- // Gemini Script Generation // --------------------------------------------------------------------------- @@ -332,7 +466,7 @@ Respond with ONLY the JSON object.`, async function createSanityDocuments( script: GeneratedScript, criticResult: CriticResult, - trends: TrendResult[], + selectedTrend: TrendResult, qualityThreshold: number, research?: ResearchPayload, researchMeta?: { notebookId: string; taskId: string }, @@ -374,8 +508,8 @@ async function createSanityDocuments( ...(isFlagged && { flaggedReason: `Quality score ${criticResult.score}/100. Issues: ${criticResult.issues.join("; ") || "Low quality score"}`, }), - trendScore: trends[0]?.score, - trendSources: trends[0]?.signals.map(s => s.source).join(", "), + trendScore: selectedTrend.score, + trendSources: selectedTrend.signals.map(s => s.source).join(", "), researchNotebookId: researchMeta?.notebookId ?? research?.notebookId, ...(researchMeta?.taskId && { researchTaskId: researchMeta.taskId }), }); @@ -442,23 +576,66 @@ export async function GET(request: NextRequest) { trends = FALLBACK_TRENDS; } - // Step 2: Optional deep research on top topic (fire-and-forget) + // Step 1.5: Dedup — walk trends list, skip already-covered topics + console.log("[CRON/ingest] Dedup: checking trends for already-covered topics..."); + let selectedTrend: TrendResult | undefined; + let skippedCount = 0; + + for (const trend of trends) { + // Extract keyword-style topics from the trend title for tag overlap check + const topicKeywords = extractSearchTerms(trend.topic); + + const covered = await isTopicAlreadyCovered(trend.topic, topicKeywords); + if (covered) { + console.log(`[CRON/ingest] Dedup: skipping "${trend.topic}" (score: ${trend.score}) — already covered`); + skippedCount++; + continue; + } + + // Also check for slug collision + if (trend.slug) { + const slugTaken = await isSlugTaken(trend.slug); + if (slugTaken) { + console.log(`[CRON/ingest] Dedup: skipping "${trend.topic}" — slug "${trend.slug}" already exists`); + skippedCount++; + continue; + } + } + + selectedTrend = trend; + break; + } + + if (!selectedTrend) { + console.log(`[CRON/ingest] Dedup: all ${trends.length} trending topics already covered. Skipping ingestion.`); + return Response.json({ + success: true, + skipped: true, + message: "All trending topics already covered", + trendCount: trends.length, + skippedCount, + }); + } + + console.log(`[CRON/ingest] Dedup: selected "${selectedTrend.topic}" (score: ${selectedTrend.score}, skipped ${skippedCount} topics)`); + + // Step 2: Optional deep research on selected topic (fire-and-forget) // When research is enabled, we create a notebook and start research // but DON'T wait for it — the check-research cron will poll and enrich later let researchMeta: { notebookId: string; taskId: string } | undefined; if (enableNotebookLmResearch) { - console.log(`[CRON/ingest] Starting fire-and-forget research on: "${trends[0].topic}"...`); + console.log(`[CRON/ingest] Starting fire-and-forget research on: "${selectedTrend.topic}"...`); try { const auth = await initAuth(); const nbClient = new NotebookLMClient(auth); // Create notebook - const notebook = await nbClient.createNotebook(trends[0].topic); + const notebook = await nbClient.createNotebook(selectedTrend.topic); const notebookId = notebook.id; console.log(`[CRON/ingest] Created notebook: ${notebookId}`); // Add source URLs from trend signals - const sourceUrls = (trends[0].signals ?? []) + const sourceUrls = (selectedTrend.signals ?? []) .map((s: { url?: string }) => s.url) .filter((u): u is string => !!u && u.startsWith("http")) .slice(0, 5); @@ -471,7 +648,7 @@ export async function GET(request: NextRequest) { console.log(`[CRON/ingest] Added ${sourceUrls.length} source URLs to notebook`); // Start deep research (fire-and-forget — don't poll!) - const researchTask = await nbClient.startResearch(notebookId, trends[0].topic, "deep"); + const researchTask = await nbClient.startResearch(notebookId, selectedTrend.topic, "deep"); const researchTaskId = researchTask?.taskId ?? ""; console.log(`[CRON/ingest] Research started — taskId: ${researchTaskId}. check-research cron will poll.`); @@ -484,7 +661,7 @@ export async function GET(request: NextRequest) { // Step 3: Generate script with Gemini (basic — without research data) // When research is enabled, check-research will re-generate an enriched script later console.log("[CRON/ingest] Generating script with Gemini..."); - const prompt = buildPrompt(trends); + const prompt = buildPrompt([selectedTrend]); const rawResponse = await generateWithGemini(prompt, SYSTEM_INSTRUCTION); let script: GeneratedScript; @@ -515,7 +692,7 @@ export async function GET(request: NextRequest) { ); console.log("[CRON/ingest] Creating Sanity documents..."); - const result = await createSanityDocuments(script, criticResult, trends, qualityThreshold, undefined, researchMeta); + const result = await createSanityDocuments(script, criticResult, selectedTrend, qualityThreshold, undefined, researchMeta); console.log("[CRON/ingest] Done!", result); @@ -525,7 +702,8 @@ export async function GET(request: NextRequest) { title: script.title, criticScore: criticResult.score, trendCount: trends.length, - trendScore: trends[0]?.score, + trendScore: selectedTrend.score, + skippedCount, researchStarted: !!researchMeta, researchNotebookId: researchMeta?.notebookId, }); diff --git a/sanity/schemas/singletons/contentConfig.ts b/sanity/schemas/singletons/contentConfig.ts index 6c67db02..b5d42d17 100644 --- a/sanity/schemas/singletons/contentConfig.ts +++ b/sanity/schemas/singletons/contentConfig.ts @@ -105,6 +105,14 @@ export default defineType({ description: "Maximum number of scenes per video. Keeps videos focused and within duration targets", initialValue: 5, }), + defineField({ + name: "dedupWindowDays", + title: "Dedup Window (days)", + type: "number", + description: "Number of days to look back when checking for duplicate topics. Topics covered within this window will be skipped during daily ingestion. Set to 0 to disable dedup.", + initialValue: 90, + validation: (Rule) => Rule.min(0).max(365), + }), ], preview: { prepare() {