Skip to content

Commit ea4b2e3

Browse files
Merge pull request #25 from geraldohomero/dev
Add transcript functionality to toDatabase.py and update database schema for transcripts
2 parents: 018a71e + 6533073 — commit ea4b2e3

File tree

3 files changed

+75
-14
lines changed

3 files changed

+75
-14
lines changed

CITATION.cff

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,14 @@ type: software
55
authors:
66
- given-names: Geraldo Homero
77
family-names: Couto Neto
8-
orcid: 'https://orcid.org/0000-0001-6686-7182'
9-
repository-code: 'https://github.com/geraldohomero/dh-youtube-database'
8+
orcid: "https://orcid.org/0000-0001-6686-7182"
9+
repository-code: "https://github.com/geraldohomero/dh-youtube-database"
1010
keywords:
1111
- youtube
1212
- database
1313
- digital humanities
1414
- digital history
1515
- data analysis
1616
version: alpha
17-
date-released: '2024-06-19'
17+
date-released: "2024-06-19"
18+
doi: 10.5281/zenodo.15258448

data/transcriptions/transcript.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,7 @@ def main():
217217
processed_count = 0
218218

219219
# Use a thread pool with a single worker
220-
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
220+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
221221
# Submit all tasks to the executor
222222
future_to_video = {
223223
executor.submit(process_video_transcript, video_id): video_id

toDatabase.py

Lines changed: 70 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,25 @@
88
from dataclasses import dataclass
99
from typing import Optional, List, Dict, Any
1010
from config import get_api_key, CHANNEL_IDS, DB_CONFIG, rotate_api_key
11+
import sys
12+
from pathlib import Path
13+
14+
# Add the project root to path to enable imports
15+
project_root = Path(__file__).resolve().parent
16+
sys.path.append(str(project_root))
17+
18+
# Import transcript functionality
19+
try:
20+
from data.transcriptions.transcript import get_transcript, format_transcript_text
21+
except ImportError:
22+
logging.error("Failed to import transcript module. Transcript functionality will be disabled.")
23+
24+
# Create placeholder functions if import fails
25+
def get_transcript(video_id):
26+
return False, "Transcript functionality not available", None
27+
28+
def format_transcript_text(transcript_data):
29+
return "Transcript formatting not available"
1130

1231
# Setup logging
1332
logging.basicConfig(
@@ -108,12 +127,17 @@ def get_video_details(video_id: str, channel_id: str) -> Optional[Dict[str, Any]
108127
published_at = datetime.strptime(video['snippet']['publishedAt'], '%Y-%m-%dT%H:%M:%SZ')
109128
# Determine if comments are enabled (the API returns 'commentCount' only if enabled)
110129
comments_enabled = 'commentCount' in video['statistics']
130+
131+
# Try to get the transcript
132+
success, transcript_text, transcript_lang = get_transcript(video_id)
133+
111134
return {
112135
'videoId': video_id,
113136
'channelId': channel_id,
114137
'videoTitle': video['snippet']['title'],
115138
'videoAudio': None,
116-
'videoTranscript': None,
139+
'videoTranscript': transcript_text if success else None,
140+
'transcriptLanguage': transcript_lang if success else None,
117141
'viewCount': int(video['statistics'].get('viewCount', 0)),
118142
'likeCount': int(video['statistics'].get('likeCount', 0)),
119143
'commentCount': int(video['statistics']['commentCount']) if comments_enabled else 0,
@@ -267,19 +291,30 @@ def save_to_database(conn: sqlite3.Connection, cursor: sqlite3.Cursor,
267291
try:
268292
video_collected_date = video_data['collectedDate'].isoformat()
269293

270-
# Update Videos table
294+
# Check if transcriptLanguage column exists, add it if missing
295+
try:
296+
cursor.execute("SELECT transcriptLanguage FROM Videos WHERE videoId = ? LIMIT 1",
297+
(video_data['videoId'],))
298+
except sqlite3.OperationalError:
299+
logging.info("Adding transcriptLanguage column to Videos table")
300+
cursor.execute("ALTER TABLE Videos ADD COLUMN transcriptLanguage TEXT")
301+
302+
# Update Videos table with transcript information
271303
cursor.execute("""
272304
INSERT INTO Videos (
273305
videoId, channelId, videoTitle, videoAudio, videoTranscript,
274-
viewCount, likeCount, commentCount, publishedAt, collectedDate
306+
viewCount, likeCount, commentCount, publishedAt, collectedDate,
307+
transcriptLanguage
275308
)
276-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
309+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
277310
ON CONFLICT(videoId) DO UPDATE SET
278311
videoTitle = excluded.videoTitle,
279312
viewCount = excluded.viewCount,
280313
likeCount = excluded.likeCount,
281314
commentCount = excluded.commentCount,
282-
collectedDate = excluded.collectedDate
315+
collectedDate = excluded.collectedDate,
316+
videoTranscript = COALESCE(excluded.videoTranscript, videoTranscript),
317+
transcriptLanguage = COALESCE(excluded.transcriptLanguage, transcriptLanguage)
283318
""", (
284319
video_data['videoId'],
285320
video_data['channelId'],
@@ -290,9 +325,10 @@ def save_to_database(conn: sqlite3.Connection, cursor: sqlite3.Cursor,
290325
video_data['likeCount'],
291326
video_data['commentCount'],
292327
video_data['publishedAt'],
293-
video_collected_date
328+
video_collected_date,
329+
video_data.get('transcriptLanguage')
294330
))
295-
331+
296332
# Insert Comments and Replies
297333
for comment in comments:
298334
comment_collected_date = comment['collectedDate'].isoformat()
@@ -341,6 +377,14 @@ def main():
341377
with sqlite3.connect(DB_CONFIG) as conn:
342378
cursor = conn.cursor()
343379
try:
380+
# Add transcriptLanguage column if it doesn't exist
381+
try:
382+
cursor.execute("SELECT transcriptLanguage FROM Videos LIMIT 1")
383+
except sqlite3.OperationalError:
384+
logging.info("Adding transcriptLanguage column to Videos table")
385+
cursor.execute("ALTER TABLE Videos ADD COLUMN transcriptLanguage TEXT")
386+
conn.commit()
387+
344388
for channel_id in CHANNEL_IDS:
345389
details = get_channel_details(channel_id)
346390
if not details:
@@ -363,7 +407,22 @@ def main():
363407
continue
364408

365409
if video_exists_in_database(cursor, video_data['videoId']):
366-
logging.info("Video %s already exists in the database. Skipping...", video_data['videoId'])
410+
logging.info("Video %s already exists in the database. Checking for transcript...", video_data['videoId'])
411+
# Check if we need to update the transcript
412+
cursor.execute("SELECT videoTranscript FROM Videos WHERE videoId = ?", (video_data['videoId'],))
413+
result = cursor.fetchone()
414+
if result and result[0] is None:
415+
# Try to get transcript for existing video that's missing transcript
416+
success, transcript_text, transcript_lang = get_transcript(video_id)
417+
if success:
418+
logging.info("Downloaded transcript for existing video %s", video_id)
419+
cursor.execute(
420+
"UPDATE Videos SET videoTranscript = ?, transcriptLanguage = ? WHERE videoId = ?",
421+
(transcript_text, transcript_lang, video_id)
422+
)
423+
conn.commit()
424+
else:
425+
logging.info("Couldn't download transcript for existing video %s: %s", video_id, transcript_text)
367426
continue
368427

369428
# Check if comments are enabled for the video.
@@ -374,8 +433,9 @@ def main():
374433
comments = get_video_comments(video_id)
375434

376435
if save_to_database(conn, cursor, channel_data, video_data, comments):
377-
logging.info("(%d/%d) Saved data for video %s (%d comments/replies)",
378-
i, total_videos, video_id, len(comments))
436+
transcript_status = "with transcript" if video_data.get('videoTranscript') else "without transcript"
437+
logging.info("(%d/%d) Saved data for video %s (%d comments/replies, %s)",
438+
i, total_videos, video_id, len(comments), transcript_status)
379439
else:
380440
logging.error("Failed to save data for video %s", video_id)
381441

0 commit comments

Comments
 (0)