update
2  osint/management/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management package
2  osint/management/commands/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management commands package
360  osint/management/commands/crawl_osint.py  Normal file
@@ -0,0 +1,360 @@
"""
Management command for OSINT crawling from seed websites.
"""
import re
import hashlib
import time
from urllib.parse import urljoin, urlparse

from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db import transaction, models
from django.conf import settings

import requests
from bs4 import BeautifulSoup

from osint.models import SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
from reports.models import ScamReport


class Command(BaseCommand):
    help = 'Crawl seed websites and search for scam-related keywords'

    def add_arguments(self, parser):
        parser.add_argument(
            '--seed-id',
            type=int,
            help='Crawl specific seed website by ID',
        )
        parser.add_argument(
            '--all',
            action='store_true',
            help='Crawl all active seed websites',
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force crawl even if recently crawled',
        )
        parser.add_argument(
            '--max-pages',
            type=int,
            default=50,
            help='Maximum pages to crawl per seed website (default: 50)',
        )
        parser.add_argument(
            '--delay',
            type=float,
            default=1.0,
            help='Delay between requests in seconds (default: 1.0)',
        )
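
    # Example invocations (a sketch; the flags are defined in add_arguments above,
    # the seed ID is hypothetical):
    #   python manage.py crawl_osint --all --max-pages 20 --delay 2.0
    #   python manage.py crawl_osint --seed-id 3
    #   python manage.py crawl_osint --force   (ignore each seed's crawl interval)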

    def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS('Starting OSINT crawling...'))

        # Get seed websites to crawl
        if options['seed_id']:
            seeds = SeedWebsite.objects.filter(id=options['seed_id'], is_active=True)
        elif options['all'] or options['force']:
            # --force behaves like --all: crawl every active seed regardless of schedule
            seeds = SeedWebsite.objects.filter(is_active=True)
        else:
            # Default: crawl seeds whose per-seed crawl interval has elapsed.
            # The interval lives on each row, so evaluate it in Python rather than
            # passing an F() expression into timedelta(), which raises TypeError.
            now = timezone.now()
            due_ids = [
                seed.id for seed in SeedWebsite.objects.filter(is_active=True)
                if seed.last_crawled_at is None
                or seed.last_crawled_at < now - timezone.timedelta(hours=seed.crawl_interval_hours)
            ]
            seeds = SeedWebsite.objects.filter(id__in=due_ids)

        if not seeds.exists():
            self.stdout.write(self.style.WARNING('No seed websites to crawl.'))
            return

        # Get active keywords
        keywords = OSINTKeyword.objects.filter(is_active=True)
        if not keywords.exists():
            self.stdout.write(self.style.WARNING('No active keywords configured.'))
            return

        self.stdout.write(f'Found {seeds.count()} seed website(s) to crawl')
        self.stdout.write(f'Found {keywords.count()} active keyword(s)')

        total_pages = 0
        total_matches = 0

        for seed in seeds:
            self.stdout.write(f'\nCrawling: {seed.name} ({seed.url})')
            pages, matches = self.crawl_seed(seed, keywords, options)
            total_pages += pages
            total_matches += matches

            # Update seed website stats
            seed.last_crawled_at = timezone.now()
            seed.pages_crawled += pages
            seed.matches_found += matches
            seed.save()

        self.stdout.write(self.style.SUCCESS(
            f'\nCrawling completed! Total pages: {total_pages}, Total matches: {total_matches}'
        ))

    def crawl_seed(self, seed, keywords, options):
        """Crawl a single seed website."""
        max_pages = options['max_pages']
        delay = options['delay']
        pages_crawled = 0
        matches_found = 0

        # Parse base URL
        parsed_base = urlparse(seed.url)
        base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"

        # Determine allowed domains
        allowed_domains = seed.allowed_domains if seed.allowed_domains else [parsed_base.netloc]

        # URLs to visit
        visited_urls = set()
        urls_to_visit = [(seed.url, 0)]  # (url, depth)
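        # Breadth-first frontier: URLs are popped from the front and newly
        # discovered links are appended with depth + 1.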

        session = requests.Session()
        session.headers.update({
            'User-Agent': seed.user_agent or 'Mozilla/5.0 (compatible; OSINTBot/1.0)'
        })

        while urls_to_visit and pages_crawled < max_pages:
            url, depth = urls_to_visit.pop(0)

            # Skip if already visited or too deep
            if url in visited_urls or depth > seed.crawl_depth:
                continue

            # Check domain
            parsed = urlparse(url)
            if parsed.netloc not in allowed_domains:
                continue

            visited_urls.add(url)

            try:
                # Fetch page
                self.stdout.write(f' Fetching: {url} (depth: {depth})')
                response = session.get(url, timeout=10, allow_redirects=True)
                response.raise_for_status()

                # Parse content
                soup = BeautifulSoup(response.text, 'lxml')

                # Remove script, style, and metadata elements before extracting text
                for script in soup(["script", "style", "meta", "link"]):
                    script.decompose()

                text_content = soup.get_text(separator=' ', strip=True)
                title = (soup.title.string or '') if soup.title else ''
                html_content = str(soup)

                # Calculate content hash
                content_hash = hashlib.sha256(text_content.encode('utf-8')).hexdigest()

                # Check for duplicates
                if CrawledContent.objects.filter(url=url, content_hash=content_hash).exists():
                    self.stdout.write(' Skipping duplicate content')
                    continue

                # Match keywords
                matched_keywords = []
                match_count = 0

                for keyword_obj in keywords:
                    matches = self.match_keyword(keyword_obj, text_content, url, title)
                    if matches:
                        matched_keywords.append(keyword_obj)
                        match_count += len(matches)

                # Calculate confidence score
                confidence_score = self.calculate_confidence(matched_keywords, match_count)
                has_potential_scam = confidence_score >= 30  # Threshold
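                # Pages at or above the threshold are flagged and get an
                # auto-generated report; scores of 80 or higher may be
                # auto-approved inside create_auto_report.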

                # Save crawled content
                with transaction.atomic():
                    crawled_content = CrawledContent.objects.create(
                        seed_website=seed,
                        url=url,
                        title=title[:500],
                        content=text_content[:10000],  # Limit content size
                        html_content=html_content[:50000],  # Limit HTML size
                        match_count=match_count,
                        confidence_score=confidence_score,
                        has_potential_scam=has_potential_scam,
                        http_status=response.status_code,
                        content_hash=content_hash
                    )
                    crawled_content.matched_keywords.set(matched_keywords)

                pages_crawled += 1

                if has_potential_scam:
                    matches_found += 1
                    self.stdout.write(self.style.WARNING(
                        f' ⚠ Potential scam detected! Confidence: {confidence_score}%'
                    ))

                    # Create auto-generated report
                    self.create_auto_report(crawled_content, matched_keywords, confidence_score)

                # Extract links for further crawling
                if depth < seed.crawl_depth:
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        absolute_url = urljoin(url, href)
                        parsed_link = urlparse(absolute_url)

                        # Only follow links within the allowed domains
                        if parsed_link.netloc in allowed_domains:
                            if absolute_url not in visited_urls:
                                urls_to_visit.append((absolute_url, depth + 1))

                # Rate limiting
                time.sleep(delay)

            except requests.RequestException as e:
                self.stdout.write(self.style.ERROR(f' Error fetching {url}: {e}'))
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f' Error processing {url}: {e}'))
                continue

        return pages_crawled, matches_found

    def match_keyword(self, keyword_obj, text, url, title):
        """Match a keyword against text content."""
        keyword = keyword_obj.keyword
        flags = 0 if keyword_obj.case_sensitive else re.IGNORECASE

        matches = []

        if keyword_obj.keyword_type == 'exact':
            if keyword_obj.case_sensitive:
                if keyword in text or keyword in url or keyword in title:
                    matches.append(keyword)
            else:
                if keyword.lower() in text.lower() or keyword.lower() in url.lower() or keyword.lower() in title.lower():
                    matches.append(keyword)

        elif keyword_obj.keyword_type == 'regex':
            try:
                pattern = re.compile(keyword, flags)
                matches = pattern.findall(text + ' ' + url + ' ' + title)
            except re.error:
                self.stdout.write(self.style.ERROR(f' Invalid regex: {keyword}'))

        elif keyword_obj.keyword_type == 'phrase':
            # Phrase matching (word boundaries)
            pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)
            matches = pattern.findall(text + ' ' + url + ' ' + title)

        elif keyword_obj.keyword_type == 'domain':
            # Domain pattern matching against the URL only
            pattern = re.compile(keyword, flags)
            matches = pattern.findall(url)

        elif keyword_obj.keyword_type == 'email':
            # Find email addresses, then keep those matching the keyword pattern
            email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', flags)
            found_emails = email_pattern.findall(text + ' ' + url)
            pattern = re.compile(keyword, flags)
            matches = [email for email in found_emails if pattern.search(email)]

        elif keyword_obj.keyword_type == 'phone':
            # Find phone-like strings, then keep those matching the keyword pattern
            phone_pattern = re.compile(r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,9}', flags)
            found_phones = phone_pattern.findall(text)
            pattern = re.compile(keyword, flags)
            matches = [phone for phone in found_phones if pattern.search(phone)]

        return matches

    def calculate_confidence(self, matched_keywords, match_count):
        """Calculate confidence score based on matched keywords."""
        if not matched_keywords:
            return 0

        # Base score: average of the matched keywords' confidence scores
        base_score = sum(kw.confidence_score for kw in matched_keywords) / len(matched_keywords)

        # Boost for multiple matches
        match_boost = min(match_count * 2, 30)  # Max 30 point boost

        # Boost for multiple different keywords
        keyword_boost = min(len(matched_keywords) * 5, 20)  # Max 20 point boost
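        # Worked example: two matched keywords with confidence scores 60 and 80
        # plus five raw matches give base 70 + match boost 10 + keyword boost 10,
        # for a total confidence of 90.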

        total_score = base_score + match_boost + keyword_boost
        return min(int(total_score), 100)  # Cap at 100

    def create_auto_report(self, crawled_content, matched_keywords, confidence_score):
        """Create an auto-generated report from crawled content."""
        # Check if report already exists
        if AutoGeneratedReport.objects.filter(crawled_content=crawled_content).exists():
            return

        # Generate title
        title = f"Potential Scam Detected: {crawled_content.title or crawled_content.url}"
        if len(title) > 500:
            title = title[:497] + '...'

        # Generate description
        description = "Automatically detected potential scam from OSINT crawling.\n\n"
        description += f"Source URL: {crawled_content.url}\n"
        description += f"Matched Keywords: {', '.join(kw.name for kw in matched_keywords)}\n"
        description += f"Confidence Score: {confidence_score}%\n\n"

        # Extract relevant snippet
        content_preview = (crawled_content.content[:500] + '...') if len(crawled_content.content) > 500 else crawled_content.content
        description += f"Content Preview:\n{content_preview}"

        # Determine whether to auto-approve
        status = 'pending'
        if confidence_score >= 80 and any(kw.auto_approve for kw in matched_keywords):
            status = 'approved'

        # Create auto-generated report
        auto_report = AutoGeneratedReport.objects.create(
            crawled_content=crawled_content,
            title=title,
            description=description,
            source_url=crawled_content.url,
            confidence_score=confidence_score,
            status=status
        )
        auto_report.matched_keywords.set(matched_keywords)

        # If auto-approved, create the actual report
        if status == 'approved':
            self.create_scam_report(auto_report)

    def create_scam_report(self, auto_report):
        """Create actual scam report from auto-generated report."""
        report = ScamReport.objects.create(
            title=auto_report.title,
            description=auto_report.description,
            reported_url=auto_report.source_url,
            scam_type='other',  # Default type, can be updated by moderator
            status='verified',  # Auto-verified (met the auto-approve threshold)
            verification_score=auto_report.confidence_score,
            is_public=True,
            is_anonymous=True,  # System-generated
            is_auto_discovered=True,  # Mark as auto-discovered
        )

        auto_report.report = report
        auto_report.status = 'published'
        auto_report.published_at = timezone.now()
        auto_report.save()

        self.stdout.write(self.style.SUCCESS(
            f' ✓ Auto-approved and published report: {report.title}'
        ))
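
A minimal usage sketch, assuming the app is installed as "osint" and that active SeedWebsite and OSINTKeyword rows exist with the fields the command reads (the command exits early otherwise); the seed ID below is hypothetical:

from django.core.management import call_command

# Equivalent to: python manage.py crawl_osint --all --max-pages 10 --delay 2.0
call_command('crawl_osint', all=True, max_pages=10, delay=2.0)

# Crawl only the seed with ID 3, regardless of its schedule
call_command('crawl_osint', seed_id=3)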