From 6fc0949a6416c850d7fe77ffd81a3089d10745a9 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Fri, 27 Mar 2026 23:31:59 +0900 Subject: [PATCH 1/3] =?UTF-8?q?refactor=20:=20=ED=81=AC=EB=A1=A4=EB=A7=81?= =?UTF-8?q?=20=EC=8A=A4=ED=81=AC=EB=A6=BD=ED=8A=B8=20crawling=20=E2=86=92?= =?UTF-8?q?=20cron=20=EB=94=94=EB=A0=89=ED=86=A0=EB=A6=AC=20=EC=9D=B4?= =?UTF-8?q?=EB=8F=99=20(#173)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/crawling/package.json | 7 ++++--- packages/crawling/src/{crawling => cron}/crawlRecentTJ.ts | 0 packages/crawling/src/{crawling => cron}/crawlYoutube.ts | 2 +- .../crawling/src/{crawling => cron}/crawlYoutubeVerify.ts | 5 ++--- 4 files changed, 7 insertions(+), 7 deletions(-) rename packages/crawling/src/{crawling => cron}/crawlRecentTJ.ts (100%) rename packages/crawling/src/{crawling => cron}/crawlYoutube.ts (98%) rename packages/crawling/src/{crawling => cron}/crawlYoutubeVerify.ts (90%) diff --git a/packages/crawling/package.json b/packages/crawling/package.json index e81f2ff..48a1907 100644 --- a/packages/crawling/package.json +++ b/packages/crawling/package.json @@ -8,10 +8,11 @@ }, "scripts": { "ky-open": "tsx src/findKYByOpen.ts", - "ky-youtube": "tsx src/crawling/crawlYoutube.ts", - "ky-verify": "tsx src/crawling/crawlYoutubeVerify.ts", + "ky-youtube": "tsx src/cron/crawlYoutube.ts", + "ky-verify": "tsx src/cron/crawlYoutubeVerify.ts", "ky-update": "pnpm run ky-youtube & pnpm run ky-verify", - "recent-tj": "tsx src/crawling/crawlRecentTJ.ts", + "recent-tj": "tsx src/cron/crawlRecentTJ.ts", + "tag-songs": "tsx src/cron/taggingSongs.ts", "lint": "eslint .", "test": "vitest run", "format": "prettier --write \"**/*.{ts,tsx,md}\"" diff --git a/packages/crawling/src/crawling/crawlRecentTJ.ts b/packages/crawling/src/cron/crawlRecentTJ.ts similarity index 100% rename from packages/crawling/src/crawling/crawlRecentTJ.ts rename to packages/crawling/src/cron/crawlRecentTJ.ts diff --git a/packages/crawling/src/crawling/crawlYoutube.ts b/packages/crawling/src/cron/crawlYoutube.ts similarity index 98% rename from packages/crawling/src/crawling/crawlYoutube.ts rename to packages/crawling/src/cron/crawlYoutube.ts index 9ce215e..39d8d97 100644 --- a/packages/crawling/src/crawling/crawlYoutube.ts +++ b/packages/crawling/src/cron/crawlYoutube.ts @@ -6,7 +6,7 @@ import { postInvalidKYSongsDB } from '@/supabase/postDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; import { Song } from '@/types'; -import { isValidKYExistNumber } from './isValidKYExistNumber'; +import { isValidKYExistNumber } from '../crawling/isValidKYExistNumber'; // --- Constants --- const BASE_YOUTUBE_SEARCH_URL = 'https://www.youtube.com/@KARAOKEKY/search'; diff --git a/packages/crawling/src/crawling/crawlYoutubeVerify.ts b/packages/crawling/src/cron/crawlYoutubeVerify.ts similarity index 90% rename from packages/crawling/src/crawling/crawlYoutubeVerify.ts rename to packages/crawling/src/cron/crawlYoutubeVerify.ts index d4c6ea9..6dc0ee6 100644 --- a/packages/crawling/src/crawling/crawlYoutubeVerify.ts +++ b/packages/crawling/src/cron/crawlYoutubeVerify.ts @@ -4,7 +4,7 @@ import { getSongsKyNotNullDB, getVerifyKySongsDB } from '@/supabase/getDB'; import { postVerifyKySongsDB } from '@/supabase/postDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; -import { isValidKYExistNumber } from './isValidKYExistNumber'; +import { isValidKYExistNumber } from '../crawling/isValidKYExistNumber'; // 기존에 등록된 KY 노래방 번호가 실제로 KY 노래방과 일치하는지 검증 // 유효한 곡은 verify_ky_songs 테이블에 insert @@ -44,9 +44,8 @@ for (const song of data) { } index++; - console.log('crawlYoutubeVerify : ', index); - if (index >= 2000) break; + if (index >= 5000) break; } browser.close(); From 466cc2f5b1951608bdacd09cd963b7a6e5c000c7 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Fri, 27 Mar 2026 23:32:25 +0900 Subject: [PATCH 2/3] =?UTF-8?q?feat=20:=20AI=20=EA=B8=B0=EB=B0=98=20?= =?UTF-8?q?=EA=B3=A1=20=ED=83=9C=EA=B7=B8=20=EC=9E=90=EB=8F=99=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=EA=B8=B0=EB=8A=A5=20=EC=B6=94=EA=B0=80=20(#173)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/tagging_song.yml | 43 ++++++++++ packages/crawling/src/cron/taggingSongs.ts | 59 ++++++++++++++ packages/crawling/src/supabase/getDB.ts | 24 ++++++ packages/crawling/src/supabase/postDB.ts | 12 +++ packages/crawling/src/utils/getSongTag.ts | 92 ++++++++++++++++++++++ 5 files changed, 230 insertions(+) create mode 100644 .github/workflows/tagging_song.yml create mode 100644 packages/crawling/src/cron/taggingSongs.ts create mode 100644 packages/crawling/src/utils/getSongTag.ts diff --git a/.github/workflows/tagging_song.yml b/.github/workflows/tagging_song.yml new file mode 100644 index 0000000..10b5111 --- /dev/null +++ b/.github/workflows/tagging_song.yml @@ -0,0 +1,43 @@ +name: Tagging Songs + +on: + schedule: + - cron: "0 14 * * *" # 한국 시간 23:00 실행 (UTC+9 → UTC 14:00) + workflow_dispatch: + +permissions: + contents: write # push 권한을 위해 필요 + +jobs: + run-npm-task: + runs-on: ubuntu-latest + + steps: + - name: Checkout branch + uses: actions/checkout@v4 + + - name: Use Node.js 20 + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install pnpm + uses: pnpm/action-setup@v2 + with: + version: 9 + run_install: false + + - name: Install dependencies + working-directory: packages/crawling + run: pnpm install + + - name: Create .env file + working-directory: packages/crawling + run: | + echo "SUPABASE_URL=${{ secrets.SUPABASE_URL }}" >> .env + echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env + echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env + + - name: run tagging script - taggingSongs.ts + working-directory: packages/crawling + run: pnpm run tag-songs diff --git a/packages/crawling/src/cron/taggingSongs.ts b/packages/crawling/src/cron/taggingSongs.ts new file mode 100644 index 0000000..2eeb7f5 --- /dev/null +++ b/packages/crawling/src/cron/taggingSongs.ts @@ -0,0 +1,59 @@ +import { getSongTagSongIdsDB, getSongsAllDB } from '@/supabase/getDB'; +import { postSongTagsDB } from '@/supabase/postDB'; +import { autoTagSong } from '@/utils/getSongTag'; + +const resultsLog = { + success: 0, + failed: 0, + skipped: 0, +}; + +// 1. 전체 곡 조회 + 이미 태그된 곡 ID 로드 +const [allSongs, taggedSongIds] = await Promise.all([getSongsAllDB(), getSongTagSongIdsDB()]); + +console.log('전체 곡 수:', allSongs.length); +console.log('이미 태그된 곡 수:', taggedSongIds.size); + +// 2. 순차 순회 (테스트: 5회만 실행) +let processedCount = 0; +for (const song of allSongs) { + if (processedCount >= 5000) break; + if (taggedSongIds.has(song.id)) { + resultsLog.skipped++; + continue; + } + + try { + const tagIds = await autoTagSong(song.title, song.artist); + + if (tagIds.length === 0) { + resultsLog.failed++; + console.log(`[FAIL] ${song.title} - ${song.artist}: 태그 없음`); + continue; + } + + const success = await postSongTagsDB(song.id, tagIds); + if (success) { + resultsLog.success++; + console.log(`[OK] ${song.title} - ${song.artist}: [${tagIds.join(', ')}]`); + } else { + resultsLog.failed++; + } + } catch (error) { + resultsLog.failed++; + console.error(`[ERROR] ${song.title} - ${song.artist}:`, error); + } + + processedCount++; + + // OpenAI rate limit 대비 딜레이 + await new Promise(resolve => setTimeout(resolve, 200)); +} + +// 3. 결과 출력 +console.log(` + 총 ${allSongs.length}곡 중: + - 스킵 (이미 태그됨): ${resultsLog.skipped}곡 + - 성공: ${resultsLog.success}곡 + - 실패: ${resultsLog.failed}곡 +`); diff --git a/packages/crawling/src/supabase/getDB.ts b/packages/crawling/src/supabase/getDB.ts index 81c7a3c..2cb5655 100644 --- a/packages/crawling/src/supabase/getDB.ts +++ b/packages/crawling/src/supabase/getDB.ts @@ -84,3 +84,27 @@ export async function getVerifyKySongsDB(): Promise> { return new Set(data.map(row => row.id)); } + +export async function getSongsAllDB(max: number = 50000) { + const supabase = getClient(); + + const { data, error } = await supabase + .from('songs') + .select('id, title, artist') + .order('created_at', { ascending: false }) + .limit(max); + + if (error) throw error; + + return data; +} + +export async function getSongTagSongIdsDB(): Promise> { + const supabase = getClient(); + + const { data, error } = await supabase.from('song_tags').select('song_id').limit(50000); + + if (error) throw error; + + return new Set(data.map(row => row.song_id)); +} diff --git a/packages/crawling/src/supabase/postDB.ts b/packages/crawling/src/supabase/postDB.ts index d53f82d..23417ae 100644 --- a/packages/crawling/src/supabase/postDB.ts +++ b/packages/crawling/src/supabase/postDB.ts @@ -52,6 +52,18 @@ export async function postVerifyKySongsDB(song: Song) { } } +export async function postSongTagsDB(songId: string, tagIds: number[]) { + const supabase = getClient(); + const rows = tagIds.map(tagId => ({ song_id: songId, tag_id: tagId })); + + const { error } = await supabase.from('song_tags').insert(rows); + if (error) { + console.error('postSongTagsDB error:', error); + return false; + } + return true; +} + export async function postInvalidKYSongsDB(song: Song) { const supabase = getClient(); diff --git a/packages/crawling/src/utils/getSongTag.ts b/packages/crawling/src/utils/getSongTag.ts new file mode 100644 index 0000000..4999846 --- /dev/null +++ b/packages/crawling/src/utils/getSongTag.ts @@ -0,0 +1,92 @@ +import OpenAI from 'openai'; +import dotenv from 'dotenv'; + +import { getClient } from '@/supabase/getClient'; + +dotenv.config(); + +const client = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, +}); + +// 태그 정보를 담을 타입 정의 +interface Tag { + id: number; + name: string; + category: string; +} + +let cachedTagsPrompt: string | null = null; + +/** + * DB에서 전체 태그 목록을 읽어와 AI 프롬프트용 텍스트로 변환한다. + */ +const getTagsForPrompt = async (): Promise => { + if (cachedTagsPrompt) return cachedTagsPrompt; + + const supabase = getClient(); + const { data: tags, error } = await supabase + .from('tags') + .select('id, name, category') + .order('id'); + + if (error) { + console.error('Error fetching tags:', error); + return ''; + } + + // AI가 읽기 편하게 "ID: 이름 (카테고리)" 형식으로 변환 + cachedTagsPrompt = tags.map((tag: Tag) => `${tag.id}: ${tag.name} (${tag.category})`).join('\n'); + return cachedTagsPrompt; +}; + +/** + * AI를 활용해 노래에 적절한 태그 ID들을 추출한다. + */ +export const autoTagSong = async (title: string, artist: string): Promise => { + try { + // 1단계: 프롬프트용 태그 리스트 준비 + const tagsPrompt = await getTagsForPrompt(); + if (!tagsPrompt) return []; + + // 2단계: OpenAI API 호출 + const response = await client.chat.completions.create({ + model: 'gpt-4o-mini', // 가성비가 좋은 모델 사용 + messages: [ + { + role: 'system', + content: ` + You are a music database expert. Based on the song title and artist, categorize the song by selecting appropriate tag IDs from the provided list. + + Guidelines: + 1. Select at least one tag, but no more than 4. + 2. Prioritize Language (100s), then Genre (200s), then Origin (300s). + 3. If it's Japanese music, ALWAYS include 101 (J-POP). + 4. Be precise. If it's from an Anime, use 302 (애니메이션). + 5. Return only JSON: {"tag_ids": [number, number, ...]} + + Allowed Tags List: + ${tagsPrompt} + `, + }, + { + role: 'user', + content: `Title: "${title}", Artist: "${artist}"`, + }, + ], + response_format: { type: 'json_object' }, + temperature: 0, // 결과의 일관성을 위해 0으로 설정 + max_tokens: 50, // 결과가 짧으므로 토큰 제한 + }); + + const content = response.choices[0].message.content; + if (!content) return []; + + // 3단계: 결과 파싱 및 반환 + const result: { tag_ids: number[] } = JSON.parse(content); + return result.tag_ids; + } catch (error) { + console.error('Error auto-tagging song:', error); + return []; + } +}; From 3023c139028b431bd5605cef102d969386bbddec Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Fri, 27 Mar 2026 23:32:50 +0900 Subject: [PATCH 3/3] =?UTF-8?q?chore=20:=20=EC=9B=8C=ED=81=AC=ED=94=8C?= =?UTF-8?q?=EB=A1=9C=EC=9A=B0=20step=20name=20=EA=B0=9C=EC=84=A0=20?= =?UTF-8?q?=EB=B0=8F=20CLAUDE.md=20=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8=20?= =?UTF-8?q?(#173)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/crawl_recent_tj.yml | 2 +- .github/workflows/update_ky_youtube.yml | 2 +- .github/workflows/verify_ky_youtube.yml | 2 +- CLAUDE.md | 2 +- apps/web/public/sitemap-0.xml | 2 +- apps/web/src/app/api/search/route.ts | 8 +++--- apps/web/src/app/api/songs/thumb-up/route.ts | 4 +-- packages/crawling/CLAUDE.md | 27 ++++++++++++++++++++ 8 files changed, 38 insertions(+), 11 deletions(-) diff --git a/.github/workflows/crawl_recent_tj.yml b/.github/workflows/crawl_recent_tj.yml index 8e61263..94ce3c0 100644 --- a/.github/workflows/crawl_recent_tj.yml +++ b/.github/workflows/crawl_recent_tj.yml @@ -33,6 +33,6 @@ jobs: echo "SUPABASE_URL=${{ secrets.SUPABASE_URL }}" >> .env echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env - - name: run crawl script + - name: run crawl script - crawlRecentTJ.ts working-directory: packages/crawling run: pnpm run recent-tj diff --git a/.github/workflows/update_ky_youtube.yml b/.github/workflows/update_ky_youtube.yml index 83ddbe7..4354cfb 100644 --- a/.github/workflows/update_ky_youtube.yml +++ b/.github/workflows/update_ky_youtube.yml @@ -38,6 +38,6 @@ jobs: echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env - - name: run update script - packages/crawling/crawlYoutube.ts + - name: run update script - crawlYoutube.ts working-directory: packages/crawling run: pnpm run ky-youtube diff --git a/.github/workflows/verify_ky_youtube.yml b/.github/workflows/verify_ky_youtube.yml index 2d9f4fe..e65f7e8 100644 --- a/.github/workflows/verify_ky_youtube.yml +++ b/.github/workflows/verify_ky_youtube.yml @@ -38,6 +38,6 @@ jobs: echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env - - name: run verify script - packages/crawling + - name: run verify script - crawlYoutubeVerify.ts working-directory: packages/crawling run: pnpm run ky-verify diff --git a/CLAUDE.md b/CLAUDE.md index ad72a5a..c5e5abb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -44,7 +44,7 @@ packages/ eslint-config/ — Shared ESLint config (@repo/eslint-config) format-config/ — Shared Prettier config (@repo/format-config) typescript-config/ — Shared tsconfig bases - crawling/ — One-off data crawling scripts (not a published package) + crawling/ — Data crawling & tagging scripts (see packages/crawling/CLAUDE.md) ``` ## Web App Architecture diff --git a/apps/web/public/sitemap-0.xml b/apps/web/public/sitemap-0.xml index 73fd5f7..c905ea8 100644 --- a/apps/web/public/sitemap-0.xml +++ b/apps/web/public/sitemap-0.xml @@ -1,4 +1,4 @@ -https://www.singcode.kr2026-03-25T14:32:28.966Zweekly0.7 +https://www.singcode.kr2026-03-27T14:29:45.638Zweekly0.7 \ No newline at end of file diff --git a/apps/web/src/app/api/search/route.ts b/apps/web/src/app/api/search/route.ts index 7b86bd3..1cb66f5 100644 --- a/apps/web/src/app/api/search/route.ts +++ b/apps/web/src/app/api/search/route.ts @@ -6,9 +6,11 @@ import { SearchSong, Song } from '@/types/song'; import { getAuthenticatedUser } from '@/utils/getAuthenticatedUser'; interface DBSong extends Song { - thumb_logs: { - thumb_count: number; - }[] | null; + thumb_logs: + | { + thumb_count: number; + }[] + | null; tosings: { user_id: string; }[]; diff --git a/apps/web/src/app/api/songs/thumb-up/route.ts b/apps/web/src/app/api/songs/thumb-up/route.ts index d8a2061..db5d6a6 100644 --- a/apps/web/src/app/api/songs/thumb-up/route.ts +++ b/apps/web/src/app/api/songs/thumb-up/route.ts @@ -30,9 +30,7 @@ export async function GET(): Promise>> { } // 3) 상위 50개 song_id 추출 - const sorted = [...thumbMap.entries()] - .sort((a, b) => b[1] - a[1]) - .slice(0, 50); + const sorted = [...thumbMap.entries()].sort((a, b) => b[1] - a[1]).slice(0, 50); const songIds = sorted.map(([songId]) => songId); diff --git a/packages/crawling/CLAUDE.md b/packages/crawling/CLAUDE.md index 7136ffc..3bdf7d0 100644 --- a/packages/crawling/CLAUDE.md +++ b/packages/crawling/CLAUDE.md @@ -13,6 +13,8 @@ pnpm ky-open # Open API(금영)로 KY 번호 수집 pnpm ky-youtube # YouTube 크롤링으로 KY 번호 수집 + AI 검증 pnpm ky-verify # 기존 KY 번호의 실제 존재 여부 재검증 (체크포인트 지원) pnpm ky-update # ky-youtube + ky-verify 병렬 실행 +pnpm recent-tj # TJ 최신곡 크롤링 +pnpm tag-songs # AI 기반 곡 자동 태깅 pnpm test # vitest 실행 pnpm lint # ESLint ``` @@ -94,8 +96,33 @@ findKYByOpen.ts | ------------------ | -------------------------------- | | `songs` | 메인 곡 데이터 (TJ/KY 번호 포함) | | `invalid_ky_songs` | KY 번호 수집 실패 목록 | +| `tags` | 태그 마스터 (id, name, category) | +| `song_tags` | 곡-태그 매핑 (song_id, tag_id) | +| `verify_ky_songs` | KY 번호 검증 완료 목록 | ### AI 유틸 - `utils/validateSongMatch.ts` — `gpt-4o-mini`로 두 (제목, 아티스트) 쌍이 같은 곡인지 판단. `temperature: 0`, `max_tokens: 20`, 완전 일치 시 API 호출 생략. - `utils/transChatGPT.ts` — `gpt-4-turbo`로 일본어 → 한국어 번역. +- `utils/getSongTag.ts` — `gpt-4o-mini`로 곡에 적절한 태그 ID 자동 할당. DB의 `tags` 테이블에서 태그 목록을 캐싱하여 프롬프트에 포함. + +### 곡 태깅 파이프라인 + +``` +taggingSongs.ts + └─ getSongsAllDB() # 전체 곡 조회 + └─ getSongTagSongIdsDB() # 이미 태그된 곡 ID Set 로드 (스킵 처리) + └─ autoTagSong(title, artist) # AI로 태그 ID 추출 (1~4개) + └─ postSongTagsDB(songId, tagIds) # song_tags 테이블에 insert +``` + +### GitHub Actions 워크플로우 + +| 워크플로우 파일 | 스케줄 (UTC) | 실행 스크립트 | +| ----------------------- | ------------ | -------------------- | +| `crawl_recent_tj.yml` | 매일 14:00 | `pnpm recent-tj` | +| `tagging_song.yml` | 매일 14:00 | `pnpm tag-songs` | +| `update_ky_youtube.yml` | 수동 | `pnpm ky-youtube` | +| `verify_ky_youtube.yml` | 수동 | `pnpm ky-verify` | + +모든 워크플로우는 `workflow_dispatch`로 수동 실행도 가능하다.