88from dataclasses import dataclass
99from typing import Optional , List , Dict , Any
1010from config import get_api_key , CHANNEL_IDS , DB_CONFIG , rotate_api_key
11+ import sys
12+ from pathlib import Path
13+
# Make this file's directory importable so that the ``data`` package
# (data/transcriptions/transcript.py) resolves regardless of the current
# working directory the script is launched from.
project_root = Path(__file__).resolve().parent
if str(project_root) not in sys.path:
    # Avoid stacking duplicate entries if the module is imported more than once.
    sys.path.append(str(project_root))

# Import transcript functionality; degrade gracefully when it is unavailable.
try:
    from data.transcriptions.transcript import get_transcript, format_transcript_text
except ImportError:
    # Log through a named logger rather than the module-level logging.error():
    # the module-level helpers implicitly call logging.basicConfig() when the
    # root logger has no handlers, which would turn the explicit
    # logging.basicConfig(...) setup that follows into a silent no-op.
    logging.getLogger(__name__).error(
        "Failed to import transcript module. Transcript functionality will be disabled."
    )

    # Placeholder functions so callers keep working without transcripts.
    def get_transcript(video_id):
        """Fallback used when the transcript module cannot be imported.

        Args:
            video_id: Identifier of the video; ignored by this stub.

        Returns:
            A ``(success, message, language)`` tuple that always reports
            failure: ``(False, "Transcript functionality not available", None)``.
        """
        return False, "Transcript functionality not available", None

    def format_transcript_text(transcript_data):
        """Fallback formatter used when the transcript module cannot be imported.

        Args:
            transcript_data: Transcript payload; ignored by this stub.

        Returns:
            The fixed string ``"Transcript formatting not available"``.
        """
        return "Transcript formatting not available"
1130
1231# Setup logging
1332logging .basicConfig (
@@ -108,12 +127,17 @@ def get_video_details(video_id: str, channel_id: str) -> Optional[Dict[str, Any]
108127 published_at = datetime .strptime (video ['snippet' ]['publishedAt' ], '%Y-%m-%dT%H:%M:%SZ' )
109128 # Determine if comments are enabled (the API returns 'commentCount' only if enabled)
110129 comments_enabled = 'commentCount' in video ['statistics' ]
130+
131+ # Try to get the transcript
132+ success , transcript_text , transcript_lang = get_transcript (video_id )
133+
111134 return {
112135 'videoId' : video_id ,
113136 'channelId' : channel_id ,
114137 'videoTitle' : video ['snippet' ]['title' ],
115138 'videoAudio' : None ,
116- 'videoTranscript' : None ,
139+ 'videoTranscript' : transcript_text if success else None ,
140+ 'transcriptLanguage' : transcript_lang if success else None ,
117141 'viewCount' : int (video ['statistics' ].get ('viewCount' , 0 )),
118142 'likeCount' : int (video ['statistics' ].get ('likeCount' , 0 )),
119143 'commentCount' : int (video ['statistics' ]['commentCount' ]) if comments_enabled else 0 ,
@@ -267,19 +291,30 @@ def save_to_database(conn: sqlite3.Connection, cursor: sqlite3.Cursor,
267291 try :
268292 video_collected_date = video_data ['collectedDate' ].isoformat ()
269293
270- # Update Videos table
294+ # Check if transcriptLanguage column exists, add it if missing
295+ try :
296+ cursor .execute ("SELECT transcriptLanguage FROM Videos WHERE videoId = ? LIMIT 1" ,
297+ (video_data ['videoId' ],))
298+ except sqlite3 .OperationalError :
299+ logging .info ("Adding transcriptLanguage column to Videos table" )
300+ cursor .execute ("ALTER TABLE Videos ADD COLUMN transcriptLanguage TEXT" )
301+
302+ # Update Videos table with transcript information
271303 cursor .execute ("""
272304 INSERT INTO Videos (
273305 videoId, channelId, videoTitle, videoAudio, videoTranscript,
274- viewCount, likeCount, commentCount, publishedAt, collectedDate
306+ viewCount, likeCount, commentCount, publishedAt, collectedDate,
307+ transcriptLanguage
275308 )
276- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
309+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )
277310 ON CONFLICT(videoId) DO UPDATE SET
278311 videoTitle = excluded.videoTitle,
279312 viewCount = excluded.viewCount,
280313 likeCount = excluded.likeCount,
281314 commentCount = excluded.commentCount,
282- collectedDate = excluded.collectedDate
315+ collectedDate = excluded.collectedDate,
316+ videoTranscript = COALESCE(excluded.videoTranscript, videoTranscript),
317+ transcriptLanguage = COALESCE(excluded.transcriptLanguage, transcriptLanguage)
283318 """ , (
284319 video_data ['videoId' ],
285320 video_data ['channelId' ],
@@ -290,9 +325,10 @@ def save_to_database(conn: sqlite3.Connection, cursor: sqlite3.Cursor,
290325 video_data ['likeCount' ],
291326 video_data ['commentCount' ],
292327 video_data ['publishedAt' ],
293- video_collected_date
328+ video_collected_date ,
329+ video_data .get ('transcriptLanguage' )
294330 ))
295-
331+
296332 # Insert Comments and Replies
297333 for comment in comments :
298334 comment_collected_date = comment ['collectedDate' ].isoformat ()
@@ -341,6 +377,14 @@ def main():
341377 with sqlite3 .connect (DB_CONFIG ) as conn :
342378 cursor = conn .cursor ()
343379 try :
380+ # Add transcriptLanguage column if it doesn't exist
381+ try :
382+ cursor .execute ("SELECT transcriptLanguage FROM Videos LIMIT 1" )
383+ except sqlite3 .OperationalError :
384+ logging .info ("Adding transcriptLanguage column to Videos table" )
385+ cursor .execute ("ALTER TABLE Videos ADD COLUMN transcriptLanguage TEXT" )
386+ conn .commit ()
387+
344388 for channel_id in CHANNEL_IDS :
345389 details = get_channel_details (channel_id )
346390 if not details :
@@ -363,7 +407,22 @@ def main():
363407 continue
364408
365409 if video_exists_in_database (cursor , video_data ['videoId' ]):
366- logging .info ("Video %s already exists in the database. Skipping..." , video_data ['videoId' ])
410+ logging .info ("Video %s already exists in the database. Checking for transcript..." , video_data ['videoId' ])
411+ # Check if we need to update the transcript
412+ cursor .execute ("SELECT videoTranscript FROM Videos WHERE videoId = ?" , (video_data ['videoId' ],))
413+ result = cursor .fetchone ()
414+ if result and result [0 ] is None :
415+ # Try to get transcript for existing video that's missing transcript
416+ success , transcript_text , transcript_lang = get_transcript (video_id )
417+ if success :
418+ logging .info ("Downloaded transcript for existing video %s" , video_id )
419+ cursor .execute (
420+ "UPDATE Videos SET videoTranscript = ?, transcriptLanguage = ? WHERE videoId = ?" ,
421+ (transcript_text , transcript_lang , video_id )
422+ )
423+ conn .commit ()
424+ else :
425+ logging .info ("Couldn't download transcript for existing video %s: %s" , video_id , transcript_text )
367426 continue
368427
369428 # Check if comments are enabled for the video.
@@ -374,8 +433,9 @@ def main():
374433 comments = get_video_comments (video_id )
375434
376435 if save_to_database (conn , cursor , channel_data , video_data , comments ):
377- logging .info ("(%d/%d) Saved data for video %s (%d comments/replies)" ,
378- i , total_videos , video_id , len (comments ))
436+ transcript_status = "with transcript" if video_data .get ('videoTranscript' ) else "without transcript"
437+ logging .info ("(%d/%d) Saved data for video %s (%d comments/replies, %s)" ,
438+ i , total_videos , video_id , len (comments ), transcript_status )
379439 else :
380440 logging .error ("Failed to save data for video %s" , video_id )
381441
0 commit comments