geraldohomero
diff --git a/api/__init__.py b/api/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/api/youtube_client.py b/api/youtube_client.py
Lines changed: 223 additions & 0 deletions
diff --git a/data/transcriptions/transcript.py b/data/transcriptions/transcript.py
Lines changed: 42 additions & 2 deletions
diff --git a/database/__init__.py b/database/__init__.py
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .youtube_client import YouTubeAPIClient
+
+__all__ = ['YouTubeAPIClient']
@@ -0,0 +1,223 @@
+import logging
+import time
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+from config import get_api_key, rotate_api_key
+from models.data_models import ChannelDetails
+
+class YouTubeAPIClient:
+    """
+    Wrapper around the YouTube Data API v3 service object.
+    The service is built with the key returned by get_api_key(); requests
+    sent through safe_execute are retried once with a rotated key when the
+    API reports that the quota is exceeded.
+    """
+
+    # Timestamp layouts accepted from the API: whole seconds, then fractional seconds.
+    _TIMESTAMP_FORMATS = ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S.%fZ')
+
+    def __init__(self):
+        self.youtube = build('youtube', 'v3', developerKey=get_api_key())
+
+    @staticmethod
+    def _with_api_key(uri: str, api_key: str) -> str:
+        """
+        Return uri with its 'key' query parameter set to api_key.
+        An existing 'key' value is replaced in place; if the URI has no
+        'key' parameter, one is appended.
+        """
+        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+        parts = urlsplit(uri)
+        pairs = parse_qsl(parts.query, keep_blank_values=True)
+        if any(name == 'key' for name, _ in pairs):
+            pairs = [(name, api_key if name == 'key' else value) for name, value in pairs]
+        else:
+            pairs.append(('key', api_key))
+        return urlunsplit(parts._replace(query=urlencode(pairs)))
+
+    @classmethod
+    def _format_timestamp(cls, value: str) -> str:
+        """
+        Convert an API timestamp (e.g. 2024-01-02T03:04:05Z) to 'YYYY-MM-DD HH:MM:SS'.
+        Raises ValueError when value matches none of the known layouts.
+        """
+        for layout in cls._TIMESTAMP_FORMATS:
+            try:
+                parsed = datetime.strptime(value, layout)
+            except ValueError:
+                continue
+            return parsed.strftime('%Y-%m-%d %H:%M:%S')
+        raise ValueError(f"Unrecognized timestamp format: {value!r}")
+
+    @classmethod
+    def _comment_row(cls, comment: Dict[str, Any], video_id: str,
+                     parent_comment_id: Optional[str], collected_date) -> Dict[str, Any]:
+        """
+        Flatten one comment resource (top-level comment or reply) into a row dict.
+        'userId' is None when the comment carries no authorChannelId.
+        """
+        snippet = comment["snippet"]
+        # NOTE(review): authorChannelId appears to be missing for some comments
+        # (presumably removed accounts); one such comment must not abort the
+        # whole collection, so it is read defensively.
+        author = snippet.get("authorChannelId") or {}
+        return {
+            "commentId": comment["id"],
+            "videoId": video_id,
+            "parentCommentId": parent_comment_id,
+            "userId": author.get("value"),
+            "userName": snippet["authorDisplayName"],
+            "content": snippet["textDisplay"],
+            "likeCount": snippet["likeCount"],
+            "publishedAt": cls._format_timestamp(snippet["publishedAt"]),
+            "collectedDate": collected_date
+        }
+
+    def safe_execute(self, request) -> Dict[str, Any]:
+        """
+        Executes a YouTube API request.
+        On a 403 response whose body mentions "quotaExceeded", rotates the API
+        key, rebuilds the service and reattempts the request once with the new
+        key. Any other HttpError is logged and re-raised, as is an HttpError
+        raised by the single retry.
+        """
+        try:
+            return request.execute()
+        except HttpError as e:
+            error_content = e.content.decode('utf-8', errors='replace') if e.content else ""
+            if e.resp.status == 403 and "quotaExceeded" in error_content:
+                logging.info("Quota exceeded. Rotating API key...")
+                new_api_key = rotate_api_key()
+                self.youtube = build('youtube', 'v3', developerKey=new_api_key)
+                # The request object was created by the old service and has the
+                # old key embedded in its URI; rebuilding self.youtube does not
+                # change it, so the key is swapped in the URI before retrying.
+                uri = getattr(request, 'uri', None)
+                if isinstance(uri, str) and new_api_key:
+                    request.uri = self._with_api_key(uri, new_api_key)
+                return request.execute()
+            logging.error("YouTube API error: %s", e)
+            raise
+
+    def get_channel_details(self, channel_id: str) -> Optional[ChannelDetails]:
+        """
+        Fetch channel details from YouTube API by channel ID.
+        Returns ChannelDetails, or None when the channel is not found or an
+        error occurs (the error is logged, not raised).
+        """
+        try:
+            request = self.youtube.channels().list(
+                part="snippet,statistics",
+                id=channel_id
+            )
+            response = self.safe_execute(request)
+            items = response.get('items', [])
+            if items:
+                channel = items[0]
+                # NOTE(review): subscriberCount is presumably omitted when the
+                # channel hides it; default to 0 instead of dropping the channel.
+                return ChannelDetails(
+                    channel_id=channel_id,
+                    channel_name=channel['snippet']['title'],
+                    subscriber_count=int(channel['statistics'].get('subscriberCount', 0))
+                )
+            logging.warning("No channel found with id: %s", channel_id)
+        except Exception as e:
+            logging.error("Error fetching channel details for id %s: %s", channel_id, e)
+        return None
+
+    def get_video_details(self, video_id: str, channel_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get detailed video information from YouTube API.
+        Returns a dictionary with video details, or None when the video is not
+        found or an error occurs (the error is logged, not raised).
+        Adds a 'commentsEnabled' flag, set from whether the statistics include
+        a 'commentCount' entry.
+        """
+        try:
+            channel_id = channel_id.strip() if channel_id else channel_id
+
+            request = self.youtube.videos().list(
+                part="snippet,statistics",
+                id=video_id
+            )
+            response = self.safe_execute(request)
+            items = response.get('items')
+            if not items:
+                logging.warning("No video details found for video id: %s", video_id)
+                return None
+
+            video = items[0]
+            statistics = video['statistics']
+            comments_enabled = 'commentCount' in statistics
+            return {
+                'videoId': video_id,
+                'channelId': channel_id,
+                'videoTitle': video['snippet']['title'],
+                'videoAudio': None,
+                'viewCount': int(statistics.get('viewCount', 0)),
+                'likeCount': int(statistics.get('likeCount', 0)),
+                'commentCount': int(statistics['commentCount']) if comments_enabled else 0,
+                'publishedAt': self._format_timestamp(video['snippet']['publishedAt']),
+                'commentsEnabled': comments_enabled
+            }
+        except Exception as e:
+            logging.error("Error fetching video details for video id %s: %s", video_id, e)
+            return None
+
+    def get_video_comments(self, video_id: str) -> List[Dict[str, Any]]:
+        """
+        Fetch video comments (and their replies) from YouTube API.
+        Returns a list of comment dictionaries; replies carry the ID of their
+        top-level comment in 'parentCommentId'.
+        If comments are disabled, logs a concise message and returns an empty list.
+        Any other failure is logged and also yields an empty list, even if
+        earlier pages had already been collected.
+        """
+        comments = []
+        next_page_token = None
+        current_date = datetime.now().date()
+
+        try:
+            while True:
+                try:
+                    request = self.youtube.commentThreads().list(
+                        part="snippet,replies",
+                        videoId=video_id,
+                        maxResults=100,
+                        pageToken=next_page_token
+                    )
+                    response = self.safe_execute(request)
+                except HttpError as e:
+                    error_message = e.content.decode("utf-8") if e.content else ""
+                    if e.resp.status == 403 and "commentsDisabled" in error_message:
+                        logging.info("Comments are disabled for video %s. Skipping.", video_id)
+                    else:
+                        logging.error("Error fetching comments for video %s. Skipping.", video_id)
+                    return []
+
+                for item in response.get("items", []):
+                    top_comment = item["snippet"]["topLevelComment"]
+                    comments.append(
+                        self._comment_row(top_comment, video_id, None, current_date)
+                    )
+
+                    # Process replies if any
+                    for reply in item.get("replies", {}).get("comments", []):
+                        comments.append(
+                            self._comment_row(reply, video_id, top_comment["id"], current_date)
+                        )
+
+                next_page_token = response.get("nextPageToken")
+                if not next_page_token:
+                    break
+
+        except Exception:
+            # exc_info keeps the cause in the log; the message alone hides it.
+            logging.error("Unexpected error fetching comments for video %s. Skipping.", video_id, exc_info=True)
+            return []
+
+        return comments
+
+    def get_channel_videos(self, channel_id: str) -> List[str]:
+        """
+        Get list of all video IDs for a given channel using the uploads playlist.
+        Returns an empty list when the channel is not found. If an error occurs
+        while paging, it is logged and the IDs collected so far are returned.
+        """
+        video_ids = []
+        next_page_token = None
+
+        try:
+            # A single lookup returns both the uploads playlist ID
+            # (contentDetails) and the channel title (snippet) used in the
+            # summary log, and it goes through safe_execute like every other call.
+            channel_request = self.youtube.channels().list(
+                part="contentDetails,snippet",
+                id=channel_id
+            )
+            channel_response = self.safe_execute(channel_request)
+
+            channels = channel_response.get('items')
+            if not channels:
+                logging.error("Could not find channel with ID: %s", channel_id)
+                return []
+            channel = channels[0]
+
+            # Extract the uploads playlist ID
+            uploads_playlist_id = channel['contentDetails']['relatedPlaylists']['uploads']
+            logging.info("Found uploads playlist ID: %s for channel: %s", uploads_playlist_id, channel_id)
+
+            # fetch all videos from this playlist
+            while True:
+                playlist_request = self.youtube.playlistItems().list(
+                    part="snippet",
+                    playlistId=uploads_playlist_id,
+                    maxResults=50,
+                    pageToken=next_page_token
+                )
+                playlist_response = self.safe_execute(playlist_request)
+
+                video_ids.extend(
+                    item['snippet']['resourceId']['videoId']
+                    for item in playlist_response.get('items', [])
+                )
+
+                next_page_token = playlist_response.get('nextPageToken')
+                if not next_page_token:
+                    break
+
+                # Add a slight delay to respect API quota
+                time.sleep(0.5)
+
+            # Log channel name along with video count
+            channel_name = channel.get('snippet', {}).get('title')
+            if channel_name is not None:
+                logging.info("Found %d videos for channel '%s' (ID: %s)", len(video_ids), channel_name, channel_id)
+            else:
+                logging.info("Found %d videos for channel (ID: %s)", len(video_ids), channel_id)
+        except Exception as e:
+            logging.error("Error fetching channel videos for channel %s: %s", channel_id, e)
+
+        return video_ids
@@ -83,7 +83,47 @@ def get_transcript(video_id: str) -> tuple:
     except TranscriptsDisabled:
         return False, "Transcripts are disabled for this video", None
     except Exception as e:
-        return False, f"Error retrieving transcript: {str(e)}", None
+        # If an error occurs, try one more time unless it's a known non-retriable error
+        error_msg = str(e).lower()
+        if "video is no longer available" in error_msg or "video unavailable" in error_msg:
+            return False, "Video is no longer available", None
+        if "age-restricted" in error_msg or "age restricted" in error_msg:
+            return False, "Video is age restricted", None
+        try:
+            # Retry once
+            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+            # Repeat the same logic as above for retry
+            manual_transcript = None
+            for transcript in transcript_list:
+                if not transcript.is_generated:
+                    if transcript.language_code == 'pt-BR' or transcript.language_code == 'pt':
+                        transcript_data = transcript.fetch()
+                        return True, format_transcript_text(transcript_data), transcript.language_code
+                    elif transcript.language_code.startswith('en'):
+                        manual_transcript = (transcript.fetch(), transcript.language_code)
+            if manual_transcript:
+                return True, format_transcript_text(manual_transcript[0]), manual_transcript[1]
+            try:
+                transcript = transcript_list.find_transcript(['pt', 'pt-BR'])
+                transcript_data = transcript.fetch()
+                return True, format_transcript_text(transcript_data), transcript.language_code
+            except:
+                try:
+                    transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
+                    transcript_data = transcript.fetch()
+                    return True, format_transcript_text(transcript_data), transcript.language_code
+                except:
+                    transcript = transcript_list.find_transcript(['pt', 'en', 'es'])
+                    transcript_data = transcript.fetch()
+                    return True, format_transcript_text(transcript_data), transcript.language_code
+        except Exception as retry_e:
+            retry_msg = str(retry_e).lower()
+            if "video is unavailable" in retry_msg or "video unavailable" in retry_msg:
+                return False, "Video is no longer available", None
+            if "age-restricted" in retry_msg or "age restricted" in retry_msg:
+                return False, "Video is age restricted", None
+            return False, f"Error retrieving transcript after retry: {str(retry_e)}", None
+        return False, f"Error retrieving transcript for {video_id}: {str(e)}", None
 
 def format_transcript_text(transcript_data: list) -> str:
     """Format transcript data into readable text with timestamps."""
@@ -217,7 +257,7 @@ def main():
     processed_count = 0
 
     # Use a thread pool with max 5 workers
-    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
         # Submit all tasks to the executor
         future_to_video = {
             executor.submit(process_video_transcript, video_id): video_id 
 
@@ -0,0 +1,3 @@
+from .db_manager import DatabaseManager
+
+__all__ = ['DatabaseManager']
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .youtube_client import YouTubeAPIClient`
	`2`	`+`
	`3`	`+__all__ = ['YouTubeAPIClient']`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .db_manager import DatabaseManager`
	`2`	`+`
	`3`	`+__all__ = ['DatabaseManager']`