i2yt/description_extractor.py at main · ByteTrix/i2yt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
"""
Description Extractor Module
Handles extraction of video descriptions using yt-dlp
"""

import os
import logging
import subprocess
import json
from typing import Dict, Optional, List
from datetime import datetime
import tempfile
import config

# Module-level logger named after this module. No handlers or levels are
# configured in this file, so output follows whatever logging configuration
# the importing application has installed.
logger = logging.getLogger(__name__)

class DescriptionExtractor:
    """Extract descriptions and metadata from Instagram reels via yt-dlp.

    Every extraction spawns a ``yt-dlp`` subprocess. If a ``cookies.txt``
    file exists in the same directory as this module, it is handed to
    yt-dlp through ``--cookies``; otherwise a warning is logged and the
    command runs without cookies.
    """

    # Maximum number of seconds a single yt-dlp invocation may run.
    TIMEOUT_SECONDS = 30

    def __init__(self):
        self.logger = logger
        self.ensure_ytdlp_installed()

    def ensure_ytdlp_installed(self):
        """Verify that ``yt-dlp`` can be executed, installing it with pip if not.

        Raises:
            subprocess.CalledProcessError: If the pip installation fails.
        """
        try:
            subprocess.run(['yt-dlp', '--version'], capture_output=True, check=True)
            self.logger.info("yt-dlp is available")
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Imported here because the module's import block does not provide it.
            import sys

            self.logger.info("Installing yt-dlp...")
            # Install into the interpreter that is running this code; a bare
            # ``pip`` on PATH may belong to another environment or be absent.
            pip_cmd = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']
            subprocess.run(pip_cmd + ['install', 'yt-dlp'], check=True)

    def _build_command(self, mode_flag: str, url: str, purpose: str) -> List[str]:
        """Assemble the yt-dlp argument list shared by the extraction methods.

        Args:
            mode_flag: yt-dlp output switch, e.g. ``--get-description``.
            url: Target URL, placed last on the command line.
            purpose: Short phrase inserted into the cookie log messages.

        Returns:
            The argument list to pass to ``subprocess.run``.
        """
        cmd = ['yt-dlp', mode_flag, '--no-warnings']

        # Use cookies.txt for authentication if it exists
        cookies_file = os.path.join(os.path.dirname(__file__), 'cookies.txt')
        if os.path.exists(cookies_file):
            cmd.extend(['--cookies', cookies_file])
            self.logger.debug(f"Using cookies from {cookies_file} for {purpose}")
        else:
            self.logger.warning(f"cookies.txt not found, {purpose} may fail")

        # "--" ends option parsing, so a URL that starts with "-" is treated
        # as a positional argument rather than as a yt-dlp option.
        cmd.extend(['--', url])
        return cmd

    def extract_description(self, url: str) -> Optional[str]:
        """
        Extract description from a single Instagram reel URL

        Args:
            url: Instagram reel URL

        Returns:
            Description text with every run of whitespace (including
            newlines) collapsed to a single space, or None if extraction
            fails for any reason (non-zero exit, timeout, other error).
        """
        try:
            self.logger.info("Extracting description from: %s", url)

            cmd = self._build_command('--get-description', url, 'authentication')
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.TIMEOUT_SECONDS,
            )

            if result.returncode != 0:
                self.logger.error("Failed to extract description: %s", result.stderr)
                return None

            # split() with no arguments drops leading/trailing whitespace and
            # breaks on newlines, so joining yields a single-line description.
            description_single_line = ' '.join(result.stdout.split())
            self.logger.info(
                "Successfully extracted description: %s...",
                description_single_line[:100],
            )
            return description_single_line

        except subprocess.TimeoutExpired:
            self.logger.error("Timeout while extracting description from: %s", url)
            return None
        except Exception as e:
            # Best-effort boundary: callers receive None instead of an exception.
            self.logger.error("Error extracting description from %s: %s", url, e)
            return None

    def extract_descriptions_batch(self, urls: List[str]) -> Dict[str, Optional[str]]:
        """
        Extract descriptions from multiple URLs

        Args:
            urls: List of Instagram reel URLs

        Returns:
            Dictionary mapping URLs to their descriptions (None for any URL
            whose extraction failed). URLs are processed one at a time, in order.
        """
        return {url: self.extract_description(url) for url in urls}

    def get_video_metadata(self, url: str) -> Optional[Dict]:
        """
        Get full metadata for a video including description, title, etc.

        Args:
            url: Instagram reel URL

        Returns:
            Dictionary parsed from yt-dlp's ``--dump-json`` output, or None if
            extraction fails (non-zero exit, timeout, invalid JSON, other error).
        """
        try:
            self.logger.info("Getting metadata from: %s", url)

            cmd = self._build_command('--dump-json', url, 'metadata extraction')
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.TIMEOUT_SECONDS,
            )

            if result.returncode != 0:
                self.logger.error("Failed to extract metadata: %s", result.stderr)
                return None

            metadata = json.loads(result.stdout)
            self.logger.info(
                "Successfully extracted metadata for: %s",
                metadata.get('title', 'Unknown'),
            )
            return metadata

        except subprocess.TimeoutExpired:
            self.logger.error("Timeout while extracting metadata from: %s", url)
            return None
        except json.JSONDecodeError as e:
            self.logger.error("Failed to parse JSON metadata: %s", e)
            return None
        except Exception as e:
            # Best-effort boundary: callers receive None instead of an exception.
            self.logger.error("Error extracting metadata from %s: %s", url, e)
            return None


if __name__ == "__main__":
    # Running this file directly performs no extraction; it only logs hints
    # pointing at main_processor.py.
    for hint in (
        "This module should be used via main_processor.py",
        "For description extraction, use: python main_processor.py descriptions",
    ):
        logger.info(hint)