OSINT/osint/management/commands/crawl_osint.py
"""
Management command for OSINT crawling from seed websites.
"""
import re
import hashlib
import time
from urllib.parse import urljoin, urlparse
from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db import transaction
import requests
from bs4 import BeautifulSoup
from osint.models import SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
from reports.models import ScamReport

class Command(BaseCommand):
    help = 'Crawl seed websites and search for scam-related keywords'

    def add_arguments(self, parser):
        parser.add_argument(
            '--seed-id',
            type=int,
            help='Crawl specific seed website by ID',
        )
        parser.add_argument(
            '--all',
            action='store_true',
            help='Crawl all active seed websites',
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force crawl even if recently crawled',
        )
        parser.add_argument(
            '--max-pages',
            type=int,
            default=50,
            help='Maximum pages to crawl per seed website (default: 50)',
        )
        parser.add_argument(
            '--delay',
            type=float,
            default=1.0,
            help='Delay between requests in seconds (default: 1.0)',
        )
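
    # Seed selection precedence in handle(): an explicit --seed-id wins,
    # then --all; otherwise only seeds whose crawl interval has elapsed
    # (or that were never crawled) are selected, unless --force is given.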
    def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS('Starting OSINT crawling...'))

        # Get seed websites to crawl
        if options['seed_id']:
            seeds = list(SeedWebsite.objects.filter(id=options['seed_id'], is_active=True))
        elif options['all'] or options['force']:
            # --force bypasses the crawl-interval check below
            seeds = list(SeedWebsite.objects.filter(is_active=True))
        else:
            # Default: crawl websites whose crawl interval has elapsed.
            # The interval is a per-row field and timedelta() cannot take
            # an F() expression, so the due check is done in Python.
            now = timezone.now()
            seeds = [
                seed for seed in SeedWebsite.objects.filter(is_active=True)
                if seed.last_crawled_at is None
                or seed.last_crawled_at < now - timezone.timedelta(hours=seed.crawl_interval_hours)
            ]

        if not seeds:
            self.stdout.write(self.style.WARNING('No seed websites to crawl.'))
            return

        # Get active keywords
        keywords = OSINTKeyword.objects.filter(is_active=True)
        if not keywords.exists():
            self.stdout.write(self.style.WARNING('No active keywords configured.'))
            return

        self.stdout.write(f'Found {len(seeds)} seed website(s) to crawl')
        self.stdout.write(f'Found {keywords.count()} active keyword(s)')

        total_pages = 0
        total_matches = 0
        for seed in seeds:
            self.stdout.write(f'\nCrawling: {seed.name} ({seed.url})')
            pages, matches = self.crawl_seed(seed, keywords, options)
            total_pages += pages
            total_matches += matches

            # Update seed website stats
            seed.last_crawled_at = timezone.now()
            seed.pages_crawled += pages
            seed.matches_found += matches
            seed.save()

        self.stdout.write(self.style.SUCCESS(
            f'\nCrawling completed! Total pages: {total_pages}, Total matches: {total_matches}'
        ))

    def crawl_seed(self, seed, keywords, options):
        """Crawl a single seed website."""
        max_pages = options['max_pages']
        delay = options['delay']
        pages_crawled = 0
        matches_found = 0

        # Determine allowed domains (default: the seed's own host)
        parsed_base = urlparse(seed.url)
        allowed_domains = seed.allowed_domains if seed.allowed_domains else [parsed_base.netloc]

        # URLs to visit
        visited_urls = set()
        urls_to_visit = [(seed.url, 0)]  # (url, depth)

        session = requests.Session()
        session.headers.update({
            'User-Agent': seed.user_agent or 'Mozilla/5.0 (compatible; OSINTBot/1.0)'
        })
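
        # Breadth-first crawl: the FIFO frontier visits shallower pages first,
        # visited_urls prevents re-fetching, and depth is bounded by
        # seed.crawl_depth. (list.pop(0) is O(n); collections.deque.popleft()
        # would be the constant-time alternative.)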
        while urls_to_visit and pages_crawled < max_pages:
            url, depth = urls_to_visit.pop(0)

            # Skip if already visited or too deep
            if url in visited_urls or depth > seed.crawl_depth:
                continue

            # Check domain
            parsed = urlparse(url)
            if parsed.netloc not in allowed_domains:
                continue

            visited_urls.add(url)

            try:
                # Fetch page
                self.stdout.write(f'  Fetching: {url} (depth: {depth})')
                response = session.get(url, timeout=10, allow_redirects=True)
                response.raise_for_status()

                # Parse content
                soup = BeautifulSoup(response.text, 'lxml')

                # Remove non-content elements before extracting text
                for tag in soup(["script", "style", "meta", "link"]):
                    tag.decompose()

                text_content = soup.get_text(separator=' ', strip=True)
                # get_text() avoids the None that soup.title.string can return
                title = soup.title.get_text(strip=True) if soup.title else ''
                html_content = str(soup)

                # Calculate content hash
                content_hash = hashlib.sha256(text_content.encode('utf-8')).hexdigest()
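
                # Re-crawled pages whose text is unchanged hash to the same
                # digest and are skipped; a changed page is stored as a new row.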
                # Check for duplicates
                if CrawledContent.objects.filter(url=url, content_hash=content_hash).exists():
                    self.stdout.write('    Skipping duplicate content')
                    continue

                # Match keywords
                matched_keywords = []
                match_count = 0
                for keyword_obj in keywords:
                    matches = self.match_keyword(keyword_obj, text_content, url, title)
                    if matches:
                        matched_keywords.append(keyword_obj)
                        match_count += len(matches)

                # Calculate confidence score
                confidence_score = self.calculate_confidence(matched_keywords, match_count)
                has_potential_scam = confidence_score >= 30  # Flagging threshold

                # Save crawled content
                with transaction.atomic():
                    crawled_content = CrawledContent.objects.create(
                        seed_website=seed,
                        url=url,
                        title=title[:500],
                        content=text_content[:10000],  # Limit content size
                        html_content=html_content[:50000],  # Limit HTML size
                        match_count=match_count,
                        confidence_score=confidence_score,
                        has_potential_scam=has_potential_scam,
                        http_status=response.status_code,
                        content_hash=content_hash
                    )
                    crawled_content.matched_keywords.set(matched_keywords)

                pages_crawled += 1

                if has_potential_scam:
                    matches_found += 1
                    self.stdout.write(self.style.WARNING(
                        f'    ⚠ Potential scam detected! Confidence: {confidence_score}%'
                    ))
                    # Create auto-generated report
                    self.create_auto_report(crawled_content, matched_keywords, confidence_score)

                # Extract links for further crawling
                if depth < seed.crawl_depth:
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        absolute_url = urljoin(url, href)
                        parsed_link = urlparse(absolute_url)
                        # Only follow same-domain links
                        if parsed_link.netloc in allowed_domains:
                            if absolute_url not in visited_urls:
                                urls_to_visit.append((absolute_url, depth + 1))

                # Rate limiting
                time.sleep(delay)

            except requests.RequestException as e:
                self.stdout.write(self.style.ERROR(f'  Error fetching {url}: {e}'))
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f'  Error processing {url}: {e}'))
                continue

        return pages_crawled, matches_found

    def match_keyword(self, keyword_obj, text, url, title):
        """Match a keyword against text content."""
        keyword = keyword_obj.keyword
        flags = 0 if keyword_obj.case_sensitive else re.IGNORECASE
        matches = []

        if keyword_obj.keyword_type == 'exact':
            haystacks = (text, url, title)
            if keyword_obj.case_sensitive:
                if any(keyword in h for h in haystacks):
                    matches.append(keyword)
            else:
                if any(keyword.lower() in h.lower() for h in haystacks):
                    matches.append(keyword)
        elif keyword_obj.keyword_type == 'regex':
            try:
                pattern = re.compile(keyword, flags)
                matches = pattern.findall(text + ' ' + url + ' ' + title)
            except re.error:
                self.stdout.write(self.style.ERROR(f'  Invalid regex: {keyword}'))
        elif keyword_obj.keyword_type == 'phrase':
            # Phrase matching (word boundaries)
            pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)
            matches = pattern.findall(text + ' ' + url + ' ' + title)
        elif keyword_obj.keyword_type == 'domain':
            # Domain pattern matching against the URL only
            pattern = re.compile(keyword, flags)
            matches = pattern.findall(url)
        elif keyword_obj.keyword_type == 'email':
            # Find email addresses, then keep those matching the keyword pattern
            email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', flags)
            found_emails = email_pattern.findall(text + ' ' + url)
            pattern = re.compile(keyword, flags)
            matches = [email for email in found_emails if pattern.search(email)]
        elif keyword_obj.keyword_type == 'phone':
            # Find phone-number-like strings, then keep those matching the keyword pattern
            phone_pattern = re.compile(r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,9}', flags)
            found_phones = phone_pattern.findall(text)
            pattern = re.compile(keyword, flags)
            matches = [phone for phone in found_phones if pattern.search(phone)]

        return matches
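
    # Note: for 'regex' keywords that contain capture groups, re.findall()
    # returns tuples rather than full matches; the caller only uses the
    # result for truthiness and len(), so scoring is unaffected.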

    def calculate_confidence(self, matched_keywords, match_count):
        """Calculate confidence score based on matched keywords."""
        if not matched_keywords:
            return 0

        # Base score: average of the matched keywords' own confidence scores
        base_score = sum(kw.confidence_score for kw in matched_keywords) / len(matched_keywords)

        # Boost for multiple matches
        match_boost = min(match_count * 2, 30)  # Max 30 point boost

        # Boost for multiple different keywords
        keyword_boost = min(len(matched_keywords) * 5, 20)  # Max 20 point boost

        total_score = base_score + match_boost + keyword_boost
        return min(int(total_score), 100)  # Cap at 100
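
    # Worked example: two keywords with confidence scores 40 and 60 matching
    # 5 times in total give base (40 + 60) / 2 = 50, a match boost of
    # min(5 * 2, 30) = 10 and a keyword boost of min(2 * 5, 20) = 10, for a
    # final score of 70 (above the flagging threshold of 30, below the
    # auto-approve threshold of 80).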

    def create_auto_report(self, crawled_content, matched_keywords, confidence_score):
        """Create an auto-generated report from crawled content."""
        # Check if report already exists
        if AutoGeneratedReport.objects.filter(crawled_content=crawled_content).exists():
            return

        # Generate title
        title = f"Potential Scam Detected: {crawled_content.title or crawled_content.url}"
        if len(title) > 500:
            title = title[:497] + '...'

        # Generate description
        description = "Automatically detected potential scam from OSINT crawling.\n\n"
        description += f"Source URL: {crawled_content.url}\n"
        description += f"Matched Keywords: {', '.join(kw.keyword for kw in matched_keywords)}\n"
        description += f"Confidence Score: {confidence_score}%\n\n"

        # Extract relevant snippet
        content_preview = (
            crawled_content.content[:500] + '...'
            if len(crawled_content.content) > 500
            else crawled_content.content
        )
        description += f"Content Preview:\n{content_preview}"

        # Auto-approve only high-confidence matches on keywords flagged for it
        status = 'pending'
        if confidence_score >= 80 and any(kw.auto_approve for kw in matched_keywords):
            status = 'approved'

        # Create auto-generated report
        auto_report = AutoGeneratedReport.objects.create(
            crawled_content=crawled_content,
            title=title,
            description=description,
            source_url=crawled_content.url,
            confidence_score=confidence_score,
            status=status
        )
        auto_report.matched_keywords.set(matched_keywords)

        # If auto-approved, create the actual report
        if status == 'approved':
            self.create_scam_report(auto_report)

    def create_scam_report(self, auto_report):
        """Create actual scam report from auto-generated report."""
        report = ScamReport.objects.create(
            title=auto_report.title,
            description=auto_report.description,
            reported_url=auto_report.source_url,
            scam_type='other',  # Default type, can be updated by moderator
            status='verified',  # Auto-verified via keyword auto-approval
            verification_score=auto_report.confidence_score,
            is_public=True,
            is_anonymous=True,  # System-generated
            is_auto_discovered=True,  # Mark as auto-discovered
        )
        auto_report.report = report
        auto_report.status = 'published'
        auto_report.published_at = timezone.now()
        auto_report.save()

        self.stdout.write(self.style.SUCCESS(
            f'  ✓ Auto-approved and published report: {report.title}'
        ))
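
# Scheduling note: the default (no-flag) invocation only crawls seeds whose
# crawl_interval_hours has elapsed, so a periodic job such as an hourly cron
# entry running `python manage.py crawl_osint` is one way to keep seeds fresh
# without re-crawling them too often.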