Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/crawl_recent_tj.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
echo "SUPABASE_URL=${{ secrets.SUPABASE_URL }}" >> .env
echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env

- name: run crawl script
- name: run crawl script - crawlRecentTJ.ts
working-directory: packages/crawling
run: pnpm run recent-tj
43 changes: 43 additions & 0 deletions .github/workflows/tagging_song.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Runs the AI song-tagging script (`pnpm run tag-songs` in packages/crawling)
# on a daily schedule, and on demand via workflow_dispatch.
name: Tagging Songs

on:
  schedule:
    - cron: "0 14 * * *" # 23:00 KST (UTC+9) == 14:00 UTC
  workflow_dispatch:

permissions:
  contents: read # checkout only; no step in this job pushes to the repository

jobs:
  run-npm-task:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout branch
        uses: actions/checkout@v4

      - name: Use Node.js 20
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 9
          run_install: false

      - name: Install dependencies
        working-directory: packages/crawling
        run: pnpm install

      # The script reads its credentials from packages/crawling/.env.
      - name: Create .env file
        working-directory: packages/crawling
        run: |
          echo "SUPABASE_URL=${{ secrets.SUPABASE_URL }}" >> .env
          echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env
          echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env

      - name: run tagging script - taggingSongs.ts
        working-directory: packages/crawling
        run: pnpm run tag-songs
2 changes: 1 addition & 1 deletion .github/workflows/update_ky_youtube.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ jobs:
echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env

- name: run update script - packages/crawling/crawlYoutube.ts
- name: run update script - crawlYoutube.ts
working-directory: packages/crawling
run: pnpm run ky-youtube
2 changes: 1 addition & 1 deletion .github/workflows/verify_ky_youtube.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ jobs:
echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env

- name: run verify script - packages/crawling
- name: run verify script - crawlYoutubeVerify.ts
working-directory: packages/crawling
run: pnpm run ky-verify
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ packages/
eslint-config/ — Shared ESLint config (@repo/eslint-config)
format-config/ — Shared Prettier config (@repo/format-config)
typescript-config/ — Shared tsconfig bases
crawling/ — One-off data crawling scripts (not a published package)
crawling/ — Data crawling & tagging scripts (see packages/crawling/CLAUDE.md)
```

## Web App Architecture
Expand Down
2 changes: 1 addition & 1 deletion apps/web/public/sitemap-0.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
<url><loc>https://www.singcode.kr</loc><lastmod>2026-03-25T14:32:28.966Z</lastmod><changefreq>weekly</changefreq><priority>0.7</priority></url>
<url><loc>https://www.singcode.kr</loc><lastmod>2026-03-27T14:29:45.638Z</lastmod><changefreq>weekly</changefreq><priority>0.7</priority></url>
</urlset>
8 changes: 5 additions & 3 deletions apps/web/src/app/api/search/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ import { SearchSong, Song } from '@/types/song';
import { getAuthenticatedUser } from '@/utils/getAuthenticatedUser';

interface DBSong extends Song {
thumb_logs: {
thumb_count: number;
}[] | null;
thumb_logs:
| {
thumb_count: number;
}[]
| null;
tosings: {
user_id: string;
}[];
Expand Down
4 changes: 1 addition & 3 deletions apps/web/src/app/api/songs/thumb-up/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ export async function GET(): Promise<NextResponse<ApiResponse<ThumbUpSong[]>>> {
}

// 3) 상위 50개 song_id 추출
const sorted = [...thumbMap.entries()]
.sort((a, b) => b[1] - a[1])
.slice(0, 50);
const sorted = [...thumbMap.entries()].sort((a, b) => b[1] - a[1]).slice(0, 50);

const songIds = sorted.map(([songId]) => songId);

Expand Down
27 changes: 27 additions & 0 deletions packages/crawling/CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ pnpm ky-open # Open API(금영)로 KY 번호 수집
pnpm ky-youtube # YouTube 크롤링으로 KY 번호 수집 + AI 검증
pnpm ky-verify # 기존 KY 번호의 실제 존재 여부 재검증 (체크포인트 지원)
pnpm ky-update # ky-youtube + ky-verify 병렬 실행
pnpm recent-tj # TJ 최신곡 크롤링
pnpm tag-songs # AI 기반 곡 자동 태깅
pnpm test # vitest 실행
pnpm lint # ESLint
```
Expand Down Expand Up @@ -94,8 +96,33 @@ findKYByOpen.ts
| ------------------ | -------------------------------- |
| `songs` | 메인 곡 데이터 (TJ/KY 번호 포함) |
| `invalid_ky_songs` | KY 번호 수집 실패 목록 |
| `tags` | 태그 마스터 (id, name, category) |
| `song_tags` | 곡-태그 매핑 (song_id, tag_id) |
| `verify_ky_songs` | KY 번호 검증 완료 목록 |

### AI 유틸

- `utils/validateSongMatch.ts` — `gpt-4o-mini`로 두 (제목, 아티스트) 쌍이 같은 곡인지 판단. `temperature: 0`, `max_tokens: 20`, 완전 일치 시 API 호출 생략.
- `utils/transChatGPT.ts` — `gpt-4-turbo`로 일본어 → 한국어 번역.
- `utils/getSongTag.ts` — `gpt-4o-mini`로 곡에 적절한 태그 ID 자동 할당. DB의 `tags` 테이블에서 태그 목록을 캐싱하여 프롬프트에 포함.

### 곡 태깅 파이프라인

```
taggingSongs.ts
└─ getSongsAllDB() # 전체 곡 조회
└─ getSongTagSongIdsDB() # 이미 태그된 곡 ID Set 로드 (스킵 처리)
└─ autoTagSong(title, artist) # AI로 태그 ID 추출 (1~4개)
└─ postSongTagsDB(songId, tagIds) # song_tags 테이블에 insert
```

### GitHub Actions 워크플로우

| 워크플로우 파일 | 스케줄 (UTC) | 실행 스크립트 |
| ----------------------- | ------------ | -------------------- |
| `crawl_recent_tj.yml` | 매일 14:00 | `pnpm recent-tj` |
| `tagging_song.yml` | 매일 14:00 | `pnpm tag-songs` |
| `update_ky_youtube.yml` | 수동 | `pnpm ky-youtube` |
| `verify_ky_youtube.yml` | 수동 | `pnpm ky-verify` |

모든 워크플로우는 `workflow_dispatch`로 수동 실행도 가능하다.
7 changes: 4 additions & 3 deletions packages/crawling/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
},
"scripts": {
"ky-open": "tsx src/findKYByOpen.ts",
"ky-youtube": "tsx src/crawling/crawlYoutube.ts",
"ky-verify": "tsx src/crawling/crawlYoutubeVerify.ts",
"ky-youtube": "tsx src/cron/crawlYoutube.ts",
"ky-verify": "tsx src/cron/crawlYoutubeVerify.ts",
"ky-update": "pnpm run ky-youtube & pnpm run ky-verify",
"recent-tj": "tsx src/crawling/crawlRecentTJ.ts",
"recent-tj": "tsx src/cron/crawlRecentTJ.ts",
"tag-songs": "tsx src/cron/taggingSongs.ts",
"lint": "eslint .",
"test": "vitest run",
"format": "prettier --write \"**/*.{ts,tsx,md}\""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { postInvalidKYSongsDB } from '@/supabase/postDB';
import { updateSongsKyDB } from '@/supabase/updateDB';
import { Song } from '@/types';

import { isValidKYExistNumber } from './isValidKYExistNumber';
import { isValidKYExistNumber } from '../crawling/isValidKYExistNumber';

// --- Constants ---
const BASE_YOUTUBE_SEARCH_URL = 'https://www.youtube.com/@KARAOKEKY/search';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { getSongsKyNotNullDB, getVerifyKySongsDB } from '@/supabase/getDB';
import { postVerifyKySongsDB } from '@/supabase/postDB';
import { updateSongsKyDB } from '@/supabase/updateDB';

import { isValidKYExistNumber } from './isValidKYExistNumber';
import { isValidKYExistNumber } from '../crawling/isValidKYExistNumber';

// 기존에 등록된 KY 노래방 번호가 실제로 KY 노래방과 일치하는지 검증
// 유효한 곡은 verify_ky_songs 테이블에 insert
Expand Down Expand Up @@ -44,9 +44,8 @@ for (const song of data) {
}

index++;
console.log('crawlYoutubeVerify : ', index);

if (index >= 2000) break;
if (index >= 5000) break;
}

browser.close();
59 changes: 59 additions & 0 deletions packages/crawling/src/cron/taggingSongs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { getSongTagSongIdsDB, getSongsAllDB } from '@/supabase/getDB';
import { postSongTagsDB } from '@/supabase/postDB';
import { autoTagSong } from '@/utils/getSongTag';

/** Per-run counters, printed in the summary at the end of the script. */
const resultsLog = {
  success: 0,
  failed: 0,
  skipped: 0,
};

/**
 * Maximum number of tagging attempts (one `autoTagSong` call each) per run.
 * Already-tagged songs are skipped without counting toward this cap, so
 * untagged songs beyond it are left for a later run.
 */
const MAX_SONGS_PER_RUN = 5000;

/** Pause after every tagging attempt, to stay under the OpenAI rate limit. */
const REQUEST_DELAY_MS = 200;

// 1. Load every song plus the set of song IDs that already have tags.
const [allSongs, taggedSongIds] = await Promise.all([getSongsAllDB(), getSongTagSongIdsDB()]);

console.log('전체 곡 수:', allSongs.length);
console.log('이미 태그된 곡 수:', taggedSongIds.size);

// 2. Walk the songs sequentially, tagging at most MAX_SONGS_PER_RUN of them.
let processedCount = 0;
for (const song of allSongs) {
  if (processedCount >= MAX_SONGS_PER_RUN) break;

  // Skipping costs no API call, so it neither counts toward the cap nor waits.
  if (taggedSongIds.has(song.id)) {
    resultsLog.skipped++;
    continue;
  }

  try {
    const tagIds = await autoTagSong(song.title, song.artist);

    // No `continue` here: an empty result still used an API call, so it must
    // fall through to the counter and the delay below. Otherwise a failing
    // API would be called back-to-back and the cap would never trigger.
    if (tagIds.length === 0) {
      resultsLog.failed++;
      console.log(`[FAIL] ${song.title} - ${song.artist}: 태그 없음`);
    } else if (await postSongTagsDB(song.id, tagIds)) {
      resultsLog.success++;
      console.log(`[OK] ${song.title} - ${song.artist}: [${tagIds.join(', ')}]`);
    } else {
      resultsLog.failed++;
    }
  } catch (error) {
    resultsLog.failed++;
    console.error(`[ERROR] ${song.title} - ${song.artist}:`, error);
  }

  processedCount++;

  await new Promise(resolve => setTimeout(resolve, REQUEST_DELAY_MS));
}

// 3. Print the run summary.
console.log(`
총 ${allSongs.length}곡 중:
- 스킵 (이미 태그됨): ${resultsLog.skipped}곡
- 성공: ${resultsLog.success}곡
- 실패: ${resultsLog.failed}곡
`);
24 changes: 24 additions & 0 deletions packages/crawling/src/supabase/getDB.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,27 @@ export async function getVerifyKySongsDB(): Promise<Set<string>> {

return new Set(data.map(row => row.id));
}

/**
 * Reads `id`, `title` and `artist` for songs in the `songs` table, newest
 * `created_at` first.
 *
 * @param max Row limit passed to the query (defaults to 50000).
 * @returns The rows returned by the query.
 * @throws The error object reported by the query, if any.
 */
export async function getSongsAllDB(max: number = 50000) {
  const newestFirst = { ascending: false };

  const result = await getClient()
    .from('songs')
    .select('id, title, artist')
    .order('created_at', newestFirst)
    .limit(max);

  if (result.error) {
    throw result.error;
  }

  return result.data;
}

/**
 * Collects the distinct `song_id` values present in `song_tags`.
 *
 * `song_tags` holds one row per (song, tag) pair, so a single capped read can
 * cut the result short. The table is therefore read page by page until a page
 * comes back empty.
 *
 * @returns Set of song IDs that have at least one tag row.
 * @throws The error object reported by any page query.
 */
export async function getSongTagSongIdsDB(): Promise<Set<string>> {
  const supabase = getClient();
  const pageSize = 1000;
  const songIds = new Set<string>();

  let from = 0;
  while (true) {
    const { data, error } = await supabase
      .from('song_tags')
      .select('song_id')
      // A fixed order keeps the offsets of consecutive pages consistent.
      .order('song_id', { ascending: true })
      .range(from, from + pageSize - 1);

    if (error) throw error;
    if (!data || data.length === 0) break;

    for (const row of data) {
      songIds.add(row.song_id);
    }

    // Advance by what was actually returned, so a server-side row cap smaller
    // than pageSize cannot end the scan early.
    from += data.length;
  }

  return songIds;
}
Comment on lines +102 to +110
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Action required

2. Incomplete tagged-song preload 🐞 Bug ≡ Correctness

getSongTagSongIdsDB() loads only 50,000 rows from song_tags (a many-to-many mapping), so
taggingSongs.ts can miss many already-tagged songs and re-run OpenAI + attempt duplicate inserts for
them.
Agent Prompt
## Issue description
`getSongTagSongIdsDB()` currently fetches only `limit(50000)` rows from `song_tags`. Because `song_tags` has multiple rows per song (1~4 tags per song), this limit does **not** represent “50k songs”; it represents “50k mappings”. As the table grows, many tagged songs will not be loaded into the Set and the tagging cron will re-tag them and attempt duplicate inserts.

## Issue Context
`taggingSongs.ts` uses this Set as the sole mechanism to skip already-tagged songs, so the correctness and completeness of this Set is critical.

## Fix Focus Areas
- packages/crawling/src/supabase/getDB.ts[102-110]
- packages/crawling/src/cron/taggingSongs.ts[11-24]

## Suggested fix
- Implement pagination over `song_tags` using `.range(from, to)` (or another paging strategy) until no rows are returned, adding `song_id` to a `Set`.
- Optionally order by `song_id` to make paging deterministic.
- Consider updating `taggedSongIds` in-memory after a successful insert (`taggedSongIds.add(song.id)`) to avoid any accidental reprocessing within the same run.

ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools

12 changes: 12 additions & 0 deletions packages/crawling/src/supabase/postDB.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ export async function postVerifyKySongsDB(song: Song) {
}
}

/**
 * Inserts one `song_tags` row per tag for the given song.
 *
 * Repeated tag IDs are collapsed first so a single call never sends the same
 * (song, tag) pair twice. An empty list is treated as a successful no-op and
 * sends no request.
 *
 * @param songId ID of the song being tagged.
 * @param tagIds Tag IDs to attach to the song.
 * @returns `true` when the insert succeeded (or there was nothing to insert),
 *          `false` when the insert reported an error (which is logged).
 */
export async function postSongTagsDB(songId: string, tagIds: number[]) {
  const uniqueTagIds = [...new Set(tagIds)];
  if (uniqueTagIds.length === 0) return true;

  const supabase = getClient();
  const rows = uniqueTagIds.map(tagId => ({ song_id: songId, tag_id: tagId }));

  const { error } = await supabase.from('song_tags').insert(rows);
  if (error) {
    console.error('postSongTagsDB error:', error);
    return false;
  }
  return true;
}

export async function postInvalidKYSongsDB(song: Song) {
const supabase = getClient();

Expand Down
92 changes: 92 additions & 0 deletions packages/crawling/src/utils/getSongTag.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import OpenAI from 'openai';
import dotenv from 'dotenv';

import { getClient } from '@/supabase/getClient';

// Load .env first: the OpenAI client below reads its key from process.env.
dotenv.config();

// OpenAI client used by autoTagSong, authenticated with OPENAI_API_KEY.
const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/** Columns selected from the `tags` table. */
interface Tag {
  id: number;
  name: string;
  category: string;
}

/** Prompt text built by getTagsForPrompt; reused while it is non-empty. */
let cachedTagsPrompt: string | null = null;

/**
 * Builds the tag list embedded in the AI prompt: one "ID: name (category)"
 * line per row of `tags`, ordered by id.
 *
 * @returns The prompt text, or an empty string when the query fails (the
 *          error is logged and nothing is cached).
 */
const getTagsForPrompt = async (): Promise<string> => {
  if (!cachedTagsPrompt) {
    const { data: tags, error } = await getClient()
      .from('tags')
      .select('id, name, category')
      .order('id');

    if (error) {
      console.error('Error fetching tags:', error);
      return '';
    }

    // One line per tag, in a form that is easy for the model to read.
    const lines: string[] = [];
    tags.forEach((tag: Tag) => {
      lines.push(`${tag.id}: ${tag.name} (${tag.category})`);
    });

    const promptText = lines.join('\n');
    cachedTagsPrompt = promptText;
    return promptText;
  }

  return cachedTagsPrompt;
};

/** Upper bound on tags per song; matches guideline 1 of the system prompt. */
const MAX_TAGS_PER_SONG = 4;

/**
 * Converts the model's raw JSON reply into a clean list of tag IDs.
 *
 * The reply is untrusted, so only integer IDs are kept. IDs absent from the
 * prompt's tag list are dropped, duplicates are removed, and the result is
 * capped at MAX_TAGS_PER_SONG. Throws if `content` is not valid JSON.
 */
const extractTagIds = (content: string, tagsPrompt: string): number[] => {
  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== 'object' || parsed === null) return [];

  const rawIds = (parsed as { tag_ids?: unknown }).tag_ids;
  if (!Array.isArray(rawIds)) return [];

  // Each prompt line has the form "<id>: <name> (<category>)", so the allowed
  // IDs can be recovered from the prompt text itself.
  const idPrefix = /^(\d+):/;
  const allowedIds = new Set<number>();
  for (const line of tagsPrompt.split('\n')) {
    const match = idPrefix.exec(line);
    if (match) allowedIds.add(Number(match[1]));
  }

  const tagIds: number[] = [];
  for (const id of rawIds) {
    if (typeof id !== 'number' || !Number.isInteger(id)) continue;
    // If no IDs could be read from the prompt, skip the allow-list check
    // rather than rejecting everything.
    if (allowedIds.size > 0 && !allowedIds.has(id)) continue;
    if (tagIds.includes(id)) continue;

    tagIds.push(id);
    if (tagIds.length === MAX_TAGS_PER_SONG) break;
  }
  return tagIds;
};

/**
 * Asks the AI model which tags fit a song and returns their IDs.
 *
 * @param title Song title sent to the model.
 * @param artist Artist name sent to the model.
 * @returns Validated tag IDs (at most MAX_TAGS_PER_SONG). Returns an empty
 *          array when the tag list is unavailable, the reply is empty or
 *          malformed, or the request throws (the error is logged).
 */
export const autoTagSong = async (title: string, artist: string): Promise<number[]> => {
  try {
    // Step 1: prepare the tag list for the prompt.
    const tagsPrompt = await getTagsForPrompt();
    if (!tagsPrompt) return [];

    // Step 2: call the OpenAI API.
    const response = await client.chat.completions.create({
      model: 'gpt-4o-mini', // cost-effective model
      messages: [
        {
          role: 'system',
          content: `
You are a music database expert. Based on the song title and artist, categorize the song by selecting appropriate tag IDs from the provided list.

Guidelines:
1. Select at least one tag, but no more than 4.
2. Prioritize Language (100s), then Genre (200s), then Origin (300s).
3. If it's Japanese music, ALWAYS include 101 (J-POP).
4. Be precise. If it's from an Anime, use 302 (애니메이션).
5. Return only JSON: {"tag_ids": [number, number, ...]}

Allowed Tags List:
${tagsPrompt}
`,
        },
        {
          role: 'user',
          content: `Title: "${title}", Artist: "${artist}"`,
        },
      ],
      response_format: { type: 'json_object' },
      temperature: 0, // 0 for consistent results
      max_tokens: 50, // the reply is short, so cap the tokens
    });

    const content = response.choices[0]?.message?.content;
    if (!content) return [];

    // Step 3: parse and validate the reply before returning it.
    return extractTagIds(content, tagsPrompt);
  } catch (error) {
    console.error('Error auto-tagging song:', error);
    return [];
  }
};