update
2  osint/management/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management package
2  osint/management/commands/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management commands package
360  osint/management/commands/crawl_osint.py  Normal file
@@ -0,0 +1,360 @@
"""
Management command for OSINT crawling from seed websites.
"""
import re
import hashlib
import time
from urllib.parse import urljoin, urlparse

from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db import transaction, models
from django.conf import settings

import requests
from bs4 import BeautifulSoup

from osint.models import SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
from reports.models import ScamReport


class Command(BaseCommand):
    help = 'Crawl seed websites and search for scam-related keywords'

    def add_arguments(self, parser):
        parser.add_argument(
            '--seed-id',
            type=int,
            help='Crawl specific seed website by ID',
        )
        parser.add_argument(
            '--all',
            action='store_true',
            help='Crawl all active seed websites',
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force crawl even if recently crawled',
        )
        parser.add_argument(
            '--max-pages',
            type=int,
            default=50,
            help='Maximum pages to crawl per seed website (default: 50)',
        )
        parser.add_argument(
            '--delay',
            type=float,
            default=1.0,
            help='Delay between requests in seconds (default: 1.0)',
        )
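
    # Example invocations (a sketch; the flags are defined in add_arguments above,
    # the seed ID is hypothetical):
    #   python manage.py crawl_osint --all --max-pages 20 --delay 2.0
    #   python manage.py crawl_osint --seed-id 3
    #   python manage.py crawl_osint --force   (ignore each seed's crawl interval)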

    def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS('Starting OSINT crawling...'))

        # Get seed websites to crawl
        if options['seed_id']:
            seeds = SeedWebsite.objects.filter(id=options['seed_id'], is_active=True)
        elif options['all'] or options['force']:
            # --force behaves like --all: crawl every active seed regardless of schedule
            seeds = SeedWebsite.objects.filter(is_active=True)
        else:
            # Default: crawl seeds whose per-seed crawl interval has elapsed.
            # The interval lives on each row, so evaluate it in Python rather than
            # passing an F() expression into timedelta(), which raises TypeError.
            now = timezone.now()
            due_ids = [
                seed.id for seed in SeedWebsite.objects.filter(is_active=True)
                if seed.last_crawled_at is None
                or seed.last_crawled_at < now - timezone.timedelta(hours=seed.crawl_interval_hours)
            ]
            seeds = SeedWebsite.objects.filter(id__in=due_ids)

        if not seeds.exists():
            self.stdout.write(self.style.WARNING('No seed websites to crawl.'))
            return

        # Get active keywords
        keywords = OSINTKeyword.objects.filter(is_active=True)
        if not keywords.exists():
            self.stdout.write(self.style.WARNING('No active keywords configured.'))
            return

        self.stdout.write(f'Found {seeds.count()} seed website(s) to crawl')
        self.stdout.write(f'Found {keywords.count()} active keyword(s)')

        total_pages = 0
        total_matches = 0

        for seed in seeds:
            self.stdout.write(f'\nCrawling: {seed.name} ({seed.url})')
            pages, matches = self.crawl_seed(seed, keywords, options)
            total_pages += pages
            total_matches += matches

            # Update seed website stats
            seed.last_crawled_at = timezone.now()
            seed.pages_crawled += pages
            seed.matches_found += matches
            seed.save()

        self.stdout.write(self.style.SUCCESS(
            f'\nCrawling completed! Total pages: {total_pages}, Total matches: {total_matches}'
        ))

    def crawl_seed(self, seed, keywords, options):
        """Crawl a single seed website."""
        max_pages = options['max_pages']
        delay = options['delay']
        pages_crawled = 0
        matches_found = 0

        # Parse base URL
        parsed_base = urlparse(seed.url)
        base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"

        # Determine allowed domains
        allowed_domains = seed.allowed_domains if seed.allowed_domains else [parsed_base.netloc]

        # URLs to visit
        visited_urls = set()
        urls_to_visit = [(seed.url, 0)]  # (url, depth)
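        # Breadth-first frontier: URLs are popped from the front and newly
        # discovered links are appended with depth + 1.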

        session = requests.Session()
        session.headers.update({
            'User-Agent': seed.user_agent or 'Mozilla/5.0 (compatible; OSINTBot/1.0)'
        })

        while urls_to_visit and pages_crawled < max_pages:
            url, depth = urls_to_visit.pop(0)

            # Skip if already visited or too deep
            if url in visited_urls or depth > seed.crawl_depth:
                continue

            # Check domain
            parsed = urlparse(url)
            if parsed.netloc not in allowed_domains:
                continue

            visited_urls.add(url)

            try:
                # Fetch page
                self.stdout.write(f' Fetching: {url} (depth: {depth})')
                response = session.get(url, timeout=10, allow_redirects=True)
                response.raise_for_status()

                # Parse content
                soup = BeautifulSoup(response.text, 'lxml')

                # Remove script, style, and metadata elements before extracting text
                for script in soup(["script", "style", "meta", "link"]):
                    script.decompose()

                text_content = soup.get_text(separator=' ', strip=True)
                title = (soup.title.string or '') if soup.title else ''
                html_content = str(soup)

                # Calculate content hash
                content_hash = hashlib.sha256(text_content.encode('utf-8')).hexdigest()

                # Check for duplicates
                if CrawledContent.objects.filter(url=url, content_hash=content_hash).exists():
                    self.stdout.write(' Skipping duplicate content')
                    continue

                # Match keywords
                matched_keywords = []
                match_count = 0

                for keyword_obj in keywords:
                    matches = self.match_keyword(keyword_obj, text_content, url, title)
                    if matches:
                        matched_keywords.append(keyword_obj)
                        match_count += len(matches)

                # Calculate confidence score
                confidence_score = self.calculate_confidence(matched_keywords, match_count)
                has_potential_scam = confidence_score >= 30  # Threshold
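                # Pages at or above the threshold are flagged and get an
                # auto-generated report; scores of 80 or higher may be
                # auto-approved inside create_auto_report.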

                # Save crawled content
                with transaction.atomic():
                    crawled_content = CrawledContent.objects.create(
                        seed_website=seed,
                        url=url,
                        title=title[:500],
                        content=text_content[:10000],  # Limit content size
                        html_content=html_content[:50000],  # Limit HTML size
                        match_count=match_count,
                        confidence_score=confidence_score,
                        has_potential_scam=has_potential_scam,
                        http_status=response.status_code,
                        content_hash=content_hash
                    )
                    crawled_content.matched_keywords.set(matched_keywords)

                pages_crawled += 1

                if has_potential_scam:
                    matches_found += 1
                    self.stdout.write(self.style.WARNING(
                        f' ⚠ Potential scam detected! Confidence: {confidence_score}%'
                    ))

                    # Create auto-generated report
                    self.create_auto_report(crawled_content, matched_keywords, confidence_score)

                # Extract links for further crawling
                if depth < seed.crawl_depth:
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        absolute_url = urljoin(url, href)
                        parsed_link = urlparse(absolute_url)

                        # Only follow links within the allowed domains
                        if parsed_link.netloc in allowed_domains:
                            if absolute_url not in visited_urls:
                                urls_to_visit.append((absolute_url, depth + 1))

                # Rate limiting
                time.sleep(delay)

            except requests.RequestException as e:
                self.stdout.write(self.style.ERROR(f' Error fetching {url}: {e}'))
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f' Error processing {url}: {e}'))
                continue

        return pages_crawled, matches_found

    def match_keyword(self, keyword_obj, text, url, title):
        """Match a keyword against text content."""
        keyword = keyword_obj.keyword
        flags = 0 if keyword_obj.case_sensitive else re.IGNORECASE

        matches = []

        if keyword_obj.keyword_type == 'exact':
            if keyword_obj.case_sensitive:
                if keyword in text or keyword in url or keyword in title:
                    matches.append(keyword)
            else:
                if keyword.lower() in text.lower() or keyword.lower() in url.lower() or keyword.lower() in title.lower():
                    matches.append(keyword)

        elif keyword_obj.keyword_type == 'regex':
            try:
                pattern = re.compile(keyword, flags)
                matches = pattern.findall(text + ' ' + url + ' ' + title)
            except re.error:
                self.stdout.write(self.style.ERROR(f' Invalid regex: {keyword}'))

        elif keyword_obj.keyword_type == 'phrase':
            # Phrase matching (word boundaries)
            pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)
            matches = pattern.findall(text + ' ' + url + ' ' + title)

        elif keyword_obj.keyword_type == 'domain':
            # Domain pattern matching against the URL only
            pattern = re.compile(keyword, flags)
            matches = pattern.findall(url)

        elif keyword_obj.keyword_type == 'email':
            # Find email addresses, then keep those matching the keyword pattern
            email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', flags)
            found_emails = email_pattern.findall(text + ' ' + url)
            pattern = re.compile(keyword, flags)
            matches = [email for email in found_emails if pattern.search(email)]

        elif keyword_obj.keyword_type == 'phone':
            # Find phone-like strings, then keep those matching the keyword pattern
            phone_pattern = re.compile(r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,9}', flags)
            found_phones = phone_pattern.findall(text)
            pattern = re.compile(keyword, flags)
            matches = [phone for phone in found_phones if pattern.search(phone)]

        return matches

    def calculate_confidence(self, matched_keywords, match_count):
        """Calculate confidence score based on matched keywords."""
        if not matched_keywords:
            return 0

        # Base score: average of the matched keywords' confidence scores
        base_score = sum(kw.confidence_score for kw in matched_keywords) / len(matched_keywords)

        # Boost for multiple matches
        match_boost = min(match_count * 2, 30)  # Max 30 point boost

        # Boost for multiple different keywords
        keyword_boost = min(len(matched_keywords) * 5, 20)  # Max 20 point boost
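        # Worked example: two matched keywords with confidence scores 60 and 80
        # plus five raw matches give base 70 + match boost 10 + keyword boost 10,
        # for a total confidence of 90.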

        total_score = base_score + match_boost + keyword_boost
        return min(int(total_score), 100)  # Cap at 100

    def create_auto_report(self, crawled_content, matched_keywords, confidence_score):
        """Create an auto-generated report from crawled content."""
        # Check if report already exists
        if AutoGeneratedReport.objects.filter(crawled_content=crawled_content).exists():
            return

        # Generate title
        title = f"Potential Scam Detected: {crawled_content.title or crawled_content.url}"
        if len(title) > 500:
            title = title[:497] + '...'

        # Generate description
        description = "Automatically detected potential scam from OSINT crawling.\n\n"
        description += f"Source URL: {crawled_content.url}\n"
        description += f"Matched Keywords: {', '.join(kw.name for kw in matched_keywords)}\n"
        description += f"Confidence Score: {confidence_score}%\n\n"

        # Extract relevant snippet
        content_preview = (crawled_content.content[:500] + '...') if len(crawled_content.content) > 500 else crawled_content.content
        description += f"Content Preview:\n{content_preview}"

        # Determine whether to auto-approve
        status = 'pending'
        if confidence_score >= 80 and any(kw.auto_approve for kw in matched_keywords):
            status = 'approved'

        # Create auto-generated report
        auto_report = AutoGeneratedReport.objects.create(
            crawled_content=crawled_content,
            title=title,
            description=description,
            source_url=crawled_content.url,
            confidence_score=confidence_score,
            status=status
        )
        auto_report.matched_keywords.set(matched_keywords)

        # If auto-approved, create the actual report
        if status == 'approved':
            self.create_scam_report(auto_report)

    def create_scam_report(self, auto_report):
        """Create actual scam report from auto-generated report."""
        report = ScamReport.objects.create(
            title=auto_report.title,
            description=auto_report.description,
            reported_url=auto_report.source_url,
            scam_type='other',  # Default type, can be updated by moderator
            status='verified',  # Auto-verified (met the auto-approve threshold)
            verification_score=auto_report.confidence_score,
            is_public=True,
            is_anonymous=True,  # System-generated
            is_auto_discovered=True,  # Mark as auto-discovered
        )

        auto_report.report = report
        auto_report.status = 'published'
        auto_report.published_at = timezone.now()
        auto_report.save()

        self.stdout.write(self.style.SUCCESS(
            f' ✓ Auto-approved and published report: {report.title}'
        ))
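
A minimal usage sketch, assuming the app is installed as "osint" and that active SeedWebsite and OSINTKeyword rows exist with the fields the command reads (the command exits early otherwise); the seed ID below is hypothetical:

from django.core.management import call_command

# Equivalent to: python manage.py crawl_osint --all --max-pages 10 --delay 2.0
call_command('crawl_osint', all=True, max_pages=10, delay=2.0)

# Crawl only the seed with ID 3, regardless of its schedule
call_command('crawl_osint', seed_id=3)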