Iliyan Angelov
2025-11-26 22:32:20 +02:00
commit ed94dd22dd
150 changed files with 14058 additions and 0 deletions

0
osint/__init__.py Normal file

246
osint/admin.py Normal file

@@ -0,0 +1,246 @@
"""
Admin configuration for osint app.
"""
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils import timezone
from .models import (
OSINTTask, OSINTResult, OSINTConfiguration,
SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
)
@admin.register(OSINTTask)
class OSINTTaskAdmin(admin.ModelAdmin):
"""OSINT task admin."""
list_display = ('report', 'task_type', 'status', 'created_at', 'completed_at')
list_filter = ('task_type', 'status', 'created_at')
search_fields = ('report__title', 'error_message')
readonly_fields = ('created_at', 'started_at', 'completed_at')
date_hierarchy = 'created_at'
@admin.register(OSINTResult)
class OSINTResultAdmin(admin.ModelAdmin):
"""OSINT result admin."""
list_display = ('report', 'source', 'data_type', 'confidence_level', 'is_verified', 'collected_at')
list_filter = ('data_type', 'is_verified', 'collected_at')
search_fields = ('report__title', 'source')
readonly_fields = ('collected_at', 'updated_at')
date_hierarchy = 'collected_at'
@admin.register(OSINTConfiguration)
class OSINTConfigurationAdmin(admin.ModelAdmin):
"""OSINT configuration admin."""
list_display = ('service_name', 'is_active', 'rate_limit', 'updated_at')
list_filter = ('is_active',)
search_fields = ('service_name',)
@admin.register(SeedWebsite)
class SeedWebsiteAdmin(admin.ModelAdmin):
"""Seed website admin."""
list_display = ('name', 'url', 'is_active', 'priority', 'last_crawled_at', 'pages_crawled', 'matches_found', 'status_indicator')
list_filter = ('is_active', 'priority', 'created_at')
search_fields = ('name', 'url', 'description')
readonly_fields = ('last_crawled_at', 'pages_crawled', 'matches_found', 'created_at', 'updated_at')
fieldsets = (
('Basic Information', {
'fields': ('name', 'url', 'description', 'is_active', 'priority', 'created_by')
}),
('Crawling Configuration', {
'fields': ('crawl_depth', 'crawl_interval_hours', 'allowed_domains', 'user_agent')
}),
('Statistics', {
'fields': ('last_crawled_at', 'pages_crawled', 'matches_found'),
'classes': ('collapse',)
}),
('Timestamps', {
'fields': ('created_at', 'updated_at'),
'classes': ('collapse',)
}),
)
date_hierarchy = 'created_at'
def status_indicator(self, obj):
"""Show visual status indicator."""
if not obj.is_active:
return format_html('<span style="color: red;">●</span> Inactive')
if not obj.last_crawled_at:
return format_html('<span style="color: orange;">●</span> Never Crawled')
hours_since = (timezone.now() - obj.last_crawled_at).total_seconds() / 3600
if hours_since > obj.crawl_interval_hours * 2:
return format_html('<span style="color: orange;">●</span> Overdue')
elif hours_since > obj.crawl_interval_hours:
return format_html('<span style="color: yellow;">●</span> Due Soon')
else:
return format_html('<span style="color: green;">●</span> Up to Date')
status_indicator.short_description = 'Status'
def save_model(self, request, obj, form, change):
if not change: # New object
obj.created_by = request.user
super().save_model(request, obj, form, change)
@admin.register(OSINTKeyword)
class OSINTKeywordAdmin(admin.ModelAdmin):
"""OSINT keyword admin."""
list_display = ('name', 'keyword', 'keyword_type', 'is_active', 'confidence_score', 'auto_approve', 'match_count')
list_filter = ('is_active', 'keyword_type', 'auto_approve', 'created_at')
search_fields = ('name', 'keyword', 'description')
readonly_fields = ('created_at', 'updated_at', 'match_count')
fieldsets = (
('Basic Information', {
'fields': ('name', 'keyword', 'description', 'keyword_type', 'is_active', 'created_by')
}),
('Matching Configuration', {
'fields': ('case_sensitive', 'confidence_score', 'auto_approve')
}),
('Statistics', {
'fields': ('match_count',),
'classes': ('collapse',)
}),
('Timestamps', {
'fields': ('created_at', 'updated_at'),
'classes': ('collapse',)
}),
)
date_hierarchy = 'created_at'
def match_count(self, obj):
"""Count how many times this keyword has matched."""
return obj.matched_contents.count()
match_count.short_description = 'Total Matches'
def save_model(self, request, obj, form, change):
if not change: # New object
obj.created_by = request.user
super().save_model(request, obj, form, change)
@admin.register(CrawledContent)
class CrawledContentAdmin(admin.ModelAdmin):
"""Crawled content admin."""
list_display = ('title', 'url', 'seed_website', 'match_count', 'confidence_score', 'has_potential_scam', 'crawled_at')
list_filter = ('has_potential_scam', 'seed_website', 'crawled_at', 'http_status')
search_fields = ('title', 'url', 'content')
readonly_fields = ('crawled_at', 'content_hash', 'http_status')
fieldsets = (
('Content Information', {
'fields': ('seed_website', 'url', 'title', 'content', 'html_content')
}),
('Analysis', {
'fields': ('matched_keywords', 'match_count', 'confidence_score', 'has_potential_scam')
}),
('Metadata', {
'fields': ('http_status', 'content_hash', 'crawled_at'),
'classes': ('collapse',)
}),
)
date_hierarchy = 'crawled_at'
filter_horizontal = ('matched_keywords',)
def get_queryset(self, request):
return super().get_queryset(request).select_related('seed_website').prefetch_related('matched_keywords')
@admin.register(AutoGeneratedReport)
class AutoGeneratedReportAdmin(admin.ModelAdmin):
"""Auto-generated report admin."""
list_display = ('title', 'source_url', 'status', 'confidence_score', 'reviewed_by', 'reviewed_at', 'view_report_link')
list_filter = ('status', 'confidence_score', 'created_at', 'reviewed_at')
search_fields = ('title', 'description', 'source_url')
readonly_fields = ('crawled_content', 'created_at', 'updated_at', 'published_at')
fieldsets = (
('Report Information', {
'fields': ('crawled_content', 'title', 'description', 'source_url')
}),
('Analysis', {
'fields': ('matched_keywords', 'confidence_score')
}),
('Review', {
'fields': ('status', 'review_notes', 'reviewed_by', 'reviewed_at', 'report')
}),
('Publication', {
'fields': ('published_at',),
'classes': ('collapse',)
}),
('Timestamps', {
'fields': ('created_at', 'updated_at'),
'classes': ('collapse',)
}),
)
date_hierarchy = 'created_at'
filter_horizontal = ('matched_keywords',)
actions = ['approve_reports', 'reject_reports', 'publish_reports']
def view_report_link(self, obj):
"""Link to the generated report if exists."""
if obj.report:
url = reverse('admin:reports_scamreport_change', args=[obj.report.pk])
return format_html('<a href="{}">View Report #{}</a>', url, obj.report.pk)
return '-'
view_report_link.short_description = 'Linked Report'
def get_queryset(self, request):
return super().get_queryset(request).select_related(
'crawled_content', 'reviewed_by', 'report'
).prefetch_related('matched_keywords')
@admin.action(description='Approve selected reports')
def approve_reports(self, request, queryset):
"""Approve selected auto-generated reports."""
updated = queryset.filter(status='pending').update(
status='approved',
reviewed_by=request.user,
reviewed_at=timezone.now()
)
self.message_user(request, f'{updated} reports approved.')
@admin.action(description='Reject selected reports')
def reject_reports(self, request, queryset):
"""Reject selected auto-generated reports."""
updated = queryset.filter(status='pending').update(
status='rejected',
reviewed_by=request.user,
reviewed_at=timezone.now()
)
self.message_user(request, f'{updated} reports rejected.')
@admin.action(description='Publish selected reports')
def publish_reports(self, request, queryset):
"""Publish approved reports."""
from reports.models import ScamReport
published = 0
for auto_report in queryset.filter(status='approved'):
if not auto_report.report:
# Create the actual scam report
report = ScamReport.objects.create(
title=auto_report.title,
description=auto_report.description,
reported_url=auto_report.source_url,
scam_type='other', # Default type
status='verified', # Auto-verified since reviewed
verification_score=auto_report.confidence_score,
is_public=True,
is_anonymous=True, # System-generated
is_auto_discovered=True, # Mark as auto-discovered
reporter_ip=None, # System-generated
)
auto_report.report = report
auto_report.status = 'published'
auto_report.published_at = timezone.now()
auto_report.save()
published += 1
self.message_user(request, f'{published} reports published.')
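The bulk actions above can also be exercised outside the admin UI, for example in a test. A minimal sketch, assuming at least one staff user exists; message_user() needs a messages backend attached to the request:

from django.contrib.admin.sites import AdminSite
from django.contrib.auth import get_user_model
from django.contrib.messages.storage.fallback import FallbackStorage
from django.test import RequestFactory

from osint.admin import AutoGeneratedReportAdmin
from osint.models import AutoGeneratedReport

request = RequestFactory().post('/')
request.user = get_user_model().objects.filter(is_staff=True).first()  # assumption: a staff user exists
request.session = {}
request._messages = FallbackStorage(request)  # message_user() needs a messages backend

model_admin = AutoGeneratedReportAdmin(AutoGeneratedReport, AdminSite())
model_admin.approve_reports(request, AutoGeneratedReport.objects.filter(status='pending'))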

6
osint/apps.py Normal file

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class OsintConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'osint'

97
osint/forms.py Normal file

@@ -0,0 +1,97 @@
"""
Forms for OSINT app.
"""
import json
from django import forms
from django.core.exceptions import ValidationError
from .models import SeedWebsite, OSINTKeyword
class SeedWebsiteForm(forms.ModelForm):
"""Form for creating/editing seed websites."""
allowed_domains_text = forms.CharField(
required=False,
widget=forms.Textarea(attrs={
'class': 'form-control',
'rows': 3,
'placeholder': 'Enter domains separated by commas or as JSON array, e.g. example.com, subdomain.example.com\nOr: ["example.com", "subdomain.example.com"]'
}),
help_text='Enter domains separated by commas or as JSON array. Leave empty for same domain only.'
)
class Meta:
model = SeedWebsite
fields = [
'name', 'url', 'description', 'is_active', 'priority',
'crawl_depth', 'crawl_interval_hours', 'user_agent'
]
widgets = {
'name': forms.TextInput(attrs={'class': 'form-control'}),
'url': forms.URLInput(attrs={'class': 'form-control'}),
'description': forms.Textarea(attrs={'class': 'form-control', 'rows': 3}),
'is_active': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
'priority': forms.Select(attrs={'class': 'form-control'}),
'crawl_depth': forms.NumberInput(attrs={'class': 'form-control', 'min': 0, 'max': 5}),
'crawl_interval_hours': forms.NumberInput(attrs={'class': 'form-control', 'min': 1}),
'user_agent': forms.TextInput(attrs={'class': 'form-control'}),
}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.instance and self.instance.pk and self.instance.allowed_domains:
# Convert list to text representation
if isinstance(self.instance.allowed_domains, list):
self.fields['allowed_domains_text'].initial = ', '.join(self.instance.allowed_domains)
else:
self.fields['allowed_domains_text'].initial = str(self.instance.allowed_domains)
def clean_allowed_domains_text(self):
text = self.cleaned_data.get('allowed_domains_text', '').strip()
if not text:
return []
# Try to parse as JSON first
try:
domains = json.loads(text)
if isinstance(domains, list):
return [str(d).strip() for d in domains if d]
except (json.JSONDecodeError, ValueError):
pass
# Otherwise, treat as comma-separated
domains = [d.strip() for d in text.split(',') if d.strip()]
return domains
def save(self, commit=True):
instance = super().save(commit=False)
instance.allowed_domains = self.cleaned_data.get('allowed_domains_text', [])
if commit:
instance.save()
return instance
class OSINTKeywordForm(forms.ModelForm):
"""Form for creating/editing OSINT keywords."""
class Meta:
model = OSINTKeyword
fields = [
'name', 'keyword', 'description', 'keyword_type', 'is_active',
'case_sensitive', 'confidence_score', 'auto_approve'
]
widgets = {
'name': forms.TextInput(attrs={'class': 'form-control'}),
'keyword': forms.Textarea(attrs={'class': 'form-control', 'rows': 2}),
'description': forms.Textarea(attrs={'class': 'form-control', 'rows': 2}),
'keyword_type': forms.Select(attrs={'class': 'form-control'}),
'is_active': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
'case_sensitive': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
'confidence_score': forms.NumberInput(attrs={
'class': 'form-control',
'min': 0,
'max': 100,
'step': 1
}),
'auto_approve': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
}
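A quick usage sketch (field values are illustrative) showing that allowed_domains_text accepts either comma-separated or JSON input and lands on SeedWebsite.allowed_domains as a list:

from osint.forms import SeedWebsiteForm

form = SeedWebsiteForm(data={
    'name': 'Example seed',
    'url': 'https://example.com',
    'is_active': 'on',
    'priority': 'medium',
    'crawl_depth': 2,
    'crawl_interval_hours': 24,
    'allowed_domains_text': 'example.com, sub.example.com',  # or '["example.com", "sub.example.com"]'
})
assert form.is_valid(), form.errors
seed = form.save()
assert seed.allowed_domains == ['example.com', 'sub.example.com']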

2
osint/management/__init__.py Normal file

@@ -0,0 +1,2 @@
# Management package

2
osint/management/commands/__init__.py Normal file

@@ -0,0 +1,2 @@
# Management commands package

360
osint/management/commands/crawl_osint.py Normal file

@@ -0,0 +1,360 @@
"""
Management command for OSINT crawling from seed websites.
"""
import re
import hashlib
import time
from datetime import timedelta
from urllib.parse import urljoin, urlparse
from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db import transaction
import requests
from bs4 import BeautifulSoup
from osint.models import SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
from reports.models import ScamReport
class Command(BaseCommand):
help = 'Crawl seed websites and search for scam-related keywords'
def add_arguments(self, parser):
parser.add_argument(
'--seed-id',
type=int,
help='Crawl specific seed website by ID',
)
parser.add_argument(
'--all',
action='store_true',
help='Crawl all active seed websites',
)
parser.add_argument(
'--force',
action='store_true',
help='Force crawl even if recently crawled',
)
parser.add_argument(
'--max-pages',
type=int,
default=50,
help='Maximum pages to crawl per seed website (default: 50)',
)
parser.add_argument(
'--delay',
type=float,
default=1.0,
help='Delay between requests in seconds (default: 1.0)',
)
def handle(self, *args, **options):
self.stdout.write(self.style.SUCCESS('Starting OSINT crawling...'))
# Get seed websites to crawl
if options['seed_id']:
seeds = SeedWebsite.objects.filter(id=options['seed_id'], is_active=True)
elif options['all'] or options['force']:  # --force crawls every active seed, ignoring the schedule
seeds = SeedWebsite.objects.filter(is_active=True)
else:
# Default: crawl only websites that are due. timedelta() cannot take
# an F() expression, so the per-seed interval check is done in Python.
now = timezone.now()
due_ids = [
seed.pk for seed in SeedWebsite.objects.filter(is_active=True)
if seed.last_crawled_at is None
or seed.last_crawled_at < now - timedelta(hours=seed.crawl_interval_hours)
]
seeds = SeedWebsite.objects.filter(pk__in=due_ids)
if not seeds.exists():
self.stdout.write(self.style.WARNING('No seed websites to crawl.'))
return
# Get active keywords
keywords = OSINTKeyword.objects.filter(is_active=True)
if not keywords.exists():
self.stdout.write(self.style.WARNING('No active keywords configured.'))
return
self.stdout.write(f'Found {seeds.count()} seed website(s) to crawl')
self.stdout.write(f'Found {keywords.count()} active keyword(s)')
total_pages = 0
total_matches = 0
for seed in seeds:
self.stdout.write(f'\nCrawling: {seed.name} ({seed.url})')
pages, matches = self.crawl_seed(seed, keywords, options)
total_pages += pages
total_matches += matches
# Update seed website stats
seed.last_crawled_at = timezone.now()
seed.pages_crawled += pages
seed.matches_found += matches
seed.save()
self.stdout.write(self.style.SUCCESS(
f'\nCrawling completed! Total pages: {total_pages}, Total matches: {total_matches}'
))
def crawl_seed(self, seed, keywords, options):
"""Crawl a single seed website."""
max_pages = options['max_pages']
delay = options['delay']
pages_crawled = 0
matches_found = 0
# Parse base URL
parsed_base = urlparse(seed.url)
base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"
# Determine allowed domains
allowed_domains = seed.allowed_domains if seed.allowed_domains else [parsed_base.netloc]
# URLs to visit
visited_urls = set()
urls_to_visit = [(seed.url, 0)] # (url, depth)
session = requests.Session()
session.headers.update({
'User-Agent': seed.user_agent or 'Mozilla/5.0 (compatible; OSINTBot/1.0)'
})
while urls_to_visit and pages_crawled < max_pages:
url, depth = urls_to_visit.pop(0)
# Skip if already visited or too deep
if url in visited_urls or depth > seed.crawl_depth:
continue
# Check domain
parsed = urlparse(url)
if parsed.netloc not in allowed_domains:
continue
visited_urls.add(url)
try:
# Fetch page
self.stdout.write(f' Fetching: {url} (depth: {depth})')
response = session.get(url, timeout=10, allow_redirects=True)
response.raise_for_status()
# Parse content
soup = BeautifulSoup(response.text, 'lxml')
# Extract text content
# Remove script and style elements
for script in soup(["script", "style", "meta", "link"]):
script.decompose()
text_content = soup.get_text(separator=' ', strip=True)
title = (soup.title.string or '').strip() if soup.title else ''  # .string can be None
html_content = str(soup)
# Calculate content hash
content_hash = hashlib.sha256(text_content.encode('utf-8')).hexdigest()
# Check for duplicates
if CrawledContent.objects.filter(url=url, content_hash=content_hash).exists():
self.stdout.write(f' Skipping duplicate content')
continue
# Match keywords
matched_keywords = []
match_count = 0
for keyword_obj in keywords:
matches = self.match_keyword(keyword_obj, text_content, url, title)
if matches:
matched_keywords.append(keyword_obj)
match_count += len(matches)
# Calculate confidence score
confidence_score = self.calculate_confidence(matched_keywords, match_count)
has_potential_scam = confidence_score >= 30 # Threshold
# Save crawled content
with transaction.atomic():
crawled_content = CrawledContent.objects.create(
seed_website=seed,
url=url,
title=title[:500],
content=text_content[:10000], # Limit content size
html_content=html_content[:50000], # Limit HTML size
match_count=match_count,
confidence_score=confidence_score,
has_potential_scam=has_potential_scam,
http_status=response.status_code,
content_hash=content_hash
)
crawled_content.matched_keywords.set(matched_keywords)
pages_crawled += 1
if has_potential_scam:
matches_found += 1
self.stdout.write(self.style.WARNING(
f' ⚠ Potential scam detected! Confidence: {confidence_score}%'
))
# Create auto-generated report
self.create_auto_report(crawled_content, matched_keywords, confidence_score)
# Extract links for further crawling
if depth < seed.crawl_depth:
for link in soup.find_all('a', href=True):
href = link['href']
absolute_url = urljoin(url, href)
parsed_link = urlparse(absolute_url)
# Only follow same-domain links
if parsed_link.netloc in allowed_domains:
if absolute_url not in visited_urls:
urls_to_visit.append((absolute_url, depth + 1))
# Rate limiting
time.sleep(delay)
except requests.RequestException as e:
self.stdout.write(self.style.ERROR(f' Error fetching {url}: {e}'))
continue
except Exception as e:
self.stdout.write(self.style.ERROR(f' Error processing {url}: {e}'))
continue
return pages_crawled, matches_found
def match_keyword(self, keyword_obj, text, url, title):
"""Match a keyword against text content."""
keyword = keyword_obj.keyword
flags = 0 if keyword_obj.case_sensitive else re.IGNORECASE
matches = []
if keyword_obj.keyword_type == 'exact':
if keyword_obj.case_sensitive:
if keyword in text or keyword in url or keyword in title:
matches.append(keyword)
else:
if keyword.lower() in text.lower() or keyword.lower() in url.lower() or keyword.lower() in title.lower():
matches.append(keyword)
elif keyword_obj.keyword_type == 'regex':
try:
pattern = re.compile(keyword, flags)
matches = pattern.findall(text + ' ' + url + ' ' + title)
except re.error:
self.stdout.write(self.style.ERROR(f' Invalid regex: {keyword}'))
elif keyword_obj.keyword_type == 'phrase':
# Phrase matching (word boundaries)
pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)
matches = pattern.findall(text + ' ' + url + ' ' + title)
elif keyword_obj.keyword_type == 'domain':
# Domain pattern matching
pattern = re.compile(keyword, flags)
matches = pattern.findall(url)
elif keyword_obj.keyword_type == 'email':
# Email pattern
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', flags)
found_emails = email_pattern.findall(text + ' ' + url)
# Check if any email matches the keyword pattern
pattern = re.compile(keyword, flags)
matches = [email for email in found_emails if pattern.search(email)]
elif keyword_obj.keyword_type == 'phone':
# Phone pattern
phone_pattern = re.compile(r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,9}', flags)
found_phones = phone_pattern.findall(text)
# Check if any phone matches the keyword pattern
pattern = re.compile(keyword, flags)
matches = [phone for phone in found_phones if pattern.search(phone)]
return matches
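# Illustrative keyword_type examples (patterns are assumptions, not fixtures):
#   exact  : 'free crypto'        -> plain substring test on text, url, and title
#   phrase : 'guaranteed returns' -> \bguaranteed returns\b with word boundaries
#   regex  : r'wallet[-_ ]?drain' -> findall over text + url + title
#   domain : r'\.example-scam\.'  -> findall over the URL only
#   email  : r'@example\.com$'    -> filters emails extracted from text + url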
def calculate_confidence(self, matched_keywords, match_count):
"""Calculate confidence score based on matched keywords."""
if not matched_keywords:
return 0
# Base score from keyword confidence scores
base_score = sum(kw.confidence_score for kw in matched_keywords) / len(matched_keywords)
# Boost for multiple matches
match_boost = min(match_count * 2, 30) # Max 30 point boost
# Boost for multiple different keywords
keyword_boost = min(len(matched_keywords) * 5, 20) # Max 20 point boost
total_score = base_score + match_boost + keyword_boost
return min(int(total_score), 100) # Cap at 100
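# Worked example for calculate_confidence (numbers are illustrative):
# two matched keywords with confidence scores 60 and 40, and 5 raw matches:
#   base_score    = (60 + 40) / 2     = 50
#   match_boost   = min(5 * 2, 30)    = 10
#   keyword_boost = min(2 * 5, 20)    = 10
#   total         = 70, capped at 100 -> 70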
def create_auto_report(self, crawled_content, matched_keywords, confidence_score):
"""Create an auto-generated report from crawled content."""
# Check if report already exists
if AutoGeneratedReport.objects.filter(crawled_content=crawled_content).exists():
return
# Generate title
title = f"Potential Scam Detected: {crawled_content.title or crawled_content.url}"
if len(title) > 500:
title = title[:497] + '...'
# Generate description
description = f"Automatically detected potential scam from OSINT crawling.\n\n"
description += f"Source URL: {crawled_content.url}\n"
description += f"Matched Keywords: {', '.join(kw.name for kw in matched_keywords)}\n"
description += f"Confidence Score: {confidence_score}%\n\n"
# Extract relevant snippet
content_preview = crawled_content.content[:500] + '...' if len(crawled_content.content) > 500 else crawled_content.content
description += f"Content Preview:\n{content_preview}"
# Determine if should auto-approve
status = 'pending'
if confidence_score >= 80 and any(kw.auto_approve for kw in matched_keywords):
status = 'approved'
# Create auto-generated report
auto_report = AutoGeneratedReport.objects.create(
crawled_content=crawled_content,
title=title,
description=description,
source_url=crawled_content.url,
confidence_score=confidence_score,
status=status
)
auto_report.matched_keywords.set(matched_keywords)
# If auto-approved, create the actual report
if status == 'approved':
self.create_scam_report(auto_report)
def create_scam_report(self, auto_report):
"""Create actual scam report from auto-generated report."""
report = ScamReport.objects.create(
title=auto_report.title,
description=auto_report.description,
reported_url=auto_report.source_url,
scam_type='other', # Default type, can be updated by moderator
status='verified', # Auto-verified since reviewed
verification_score=auto_report.confidence_score,
is_public=True,
is_anonymous=True, # System-generated
is_auto_discovered=True, # Mark as auto-discovered
)
auto_report.report = report
auto_report.status = 'published'
auto_report.published_at = timezone.now()
auto_report.save()
self.stdout.write(self.style.SUCCESS(
f' ✓ Auto-approved and published report: {report.title}'
))
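Typical invocations of this command (seed IDs and option values are illustrative); options map one-to-one between the CLI and call_command:

# From a shell:
#   python manage.py crawl_osint --all --max-pages 25 --delay 2.0
#   python manage.py crawl_osint --seed-id 3 --force
# Or programmatically:
from django.core.management import call_command

call_command('crawl_osint', '--all', '--max-pages', '25', '--delay', '2.0')
call_command('crawl_osint', '--seed-id', '3', '--force')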

80
osint/migrations/0001_initial.py Normal file

@@ -0,0 +1,80 @@
# Generated by Django 5.2.8 on 2025-11-26 13:41
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
('reports', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='OSINTConfiguration',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('service_name', models.CharField(max_length=100, unique=True)),
('api_key', models.CharField(blank=True, help_text='Encrypted API key', max_length=255)),
('api_url', models.URLField(blank=True)),
('is_active', models.BooleanField(default=True)),
('rate_limit', models.IntegerField(default=100, help_text='Requests per hour')),
('configuration', models.JSONField(blank=True, default=dict, help_text='Additional configuration')),
('created_at', models.DateTimeField(auto_now_add=True)),
('updated_at', models.DateTimeField(auto_now=True)),
],
options={
'verbose_name': 'OSINT Configuration',
'verbose_name_plural': 'OSINT Configurations',
'db_table': 'osint_osintconfiguration',
},
),
migrations.CreateModel(
name='OSINTResult',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('source', models.CharField(help_text='OSINT source/service name', max_length=100)),
('data_type', models.CharField(choices=[('whois', 'WHOIS Data'), ('dns', 'DNS Records'), ('ssl', 'SSL Certificate'), ('archive', 'Archive Data'), ('email', 'Email Data'), ('phone', 'Phone Data'), ('business', 'Business Registry Data'), ('social', 'Social Media Data'), ('reputation', 'Reputation Data')], max_length=50)),
('raw_data', models.JSONField(default=dict, help_text='Raw data from OSINT source')),
('processed_data', models.JSONField(blank=True, default=dict, help_text='Processed/cleaned data')),
('confidence_level', models.IntegerField(default=0, help_text='Confidence level (0-100)')),
('is_verified', models.BooleanField(default=False, help_text='Manually verified by moderator')),
('collected_at', models.DateTimeField(auto_now_add=True)),
('updated_at', models.DateTimeField(auto_now=True)),
('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='osint_results', to='reports.scamreport')),
],
options={
'verbose_name': 'OSINT Result',
'verbose_name_plural': 'OSINT Results',
'db_table': 'osint_osintresult',
'ordering': ['-collected_at'],
'indexes': [models.Index(fields=['report', 'data_type'], name='osint_osint_report__4a95b0_idx'), models.Index(fields=['confidence_level', 'is_verified'], name='osint_osint_confide_47552d_idx')],
},
),
migrations.CreateModel(
name='OSINTTask',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('task_type', models.CharField(choices=[('domain_analysis', 'Domain Analysis'), ('url_analysis', 'URL Analysis'), ('email_analysis', 'Email Analysis'), ('phone_analysis', 'Phone Analysis'), ('whois_lookup', 'WHOIS Lookup'), ('dns_lookup', 'DNS Lookup'), ('ssl_check', 'SSL Certificate Check'), ('archive_check', 'Archive Check'), ('business_registry', 'Business Registry Check'), ('social_media', 'Social Media Check')], max_length=50)),
('status', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('completed', 'Completed'), ('failed', 'Failed'), ('cancelled', 'Cancelled')], default='pending', max_length=20)),
('parameters', models.JSONField(default=dict, help_text='Task parameters (e.g., URL, email, phone)')),
('result', models.JSONField(blank=True, default=dict, help_text='Task result data')),
('error_message', models.TextField(blank=True)),
('created_at', models.DateTimeField(auto_now_add=True)),
('started_at', models.DateTimeField(blank=True, null=True)),
('completed_at', models.DateTimeField(blank=True, null=True)),
('retry_count', models.IntegerField(default=0)),
('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='osint_tasks', to='reports.scamreport')),
],
options={
'verbose_name': 'OSINT Task',
'verbose_name_plural': 'OSINT Tasks',
'db_table': 'osint_osinttask',
'ordering': ['-created_at'],
'indexes': [models.Index(fields=['status', 'created_at'], name='osint_osint_status_290802_idx'), models.Index(fields=['report', 'task_type'], name='osint_osint_report__e7bd16_idx')],
},
),
]

157
osint/migrations/0002_… Normal file

@@ -0,0 +1,157 @@
# Generated by Django 5.2.8 on 2025-11-26 18:03
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('osint', '0001_initial'),
('reports', '0002_scamreport_is_auto_discovered'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='OSINTKeyword',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('keyword', models.CharField(help_text='Keyword, phrase, or regex pattern to search for', max_length=500)),
('name', models.CharField(help_text='Friendly name for this keyword', max_length=200)),
('description', models.TextField(blank=True, help_text='Description of what this keyword detects')),
('keyword_type', models.CharField(choices=[('exact', 'Exact Match'), ('regex', 'Regular Expression'), ('phrase', 'Phrase Match'), ('domain', 'Domain Pattern'), ('email', 'Email Pattern'), ('phone', 'Phone Pattern')], default='phrase', help_text='Type of matching to perform', max_length=20)),
('is_active', models.BooleanField(default=True, help_text='Enable/disable this keyword')),
('case_sensitive', models.BooleanField(default=False, help_text='Case sensitive matching')),
('confidence_score', models.IntegerField(default=50, help_text='Default confidence score (0-100) when this keyword matches')),
('auto_approve', models.BooleanField(default=False, help_text='Auto-approve reports matching this keyword (requires high confidence)')),
('created_at', models.DateTimeField(auto_now_add=True)),
('updated_at', models.DateTimeField(auto_now=True)),
('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_keywords', to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'OSINT Keyword',
'verbose_name_plural': 'OSINT Keywords',
'db_table': 'osint_keyword',
'ordering': ['-is_active', 'name'],
},
),
migrations.CreateModel(
name='CrawledContent',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.URLField(help_text='URL of the crawled page', max_length=1000)),
('title', models.CharField(blank=True, help_text='Page title', max_length=500)),
('content', models.TextField(help_text='Crawled page content')),
('html_content', models.TextField(blank=True, help_text='Raw HTML content')),
('match_count', models.IntegerField(default=0, help_text='Number of keyword matches found')),
('confidence_score', models.IntegerField(default=0, help_text='Calculated confidence score based on matches')),
('has_potential_scam', models.BooleanField(default=False, help_text='Flagged as potential scam based on keyword matches')),
('crawled_at', models.DateTimeField(auto_now_add=True)),
('http_status', models.IntegerField(blank=True, help_text='HTTP status code', null=True)),
('content_hash', models.CharField(blank=True, help_text='SHA256 hash of content for deduplication', max_length=64)),
('matched_keywords', models.ManyToManyField(blank=True, help_text='Keywords that matched this content', related_name='matched_contents', to='osint.osintkeyword')),
],
options={
'verbose_name': 'Crawled Content',
'verbose_name_plural': 'Crawled Contents',
'db_table': 'osint_crawledcontent',
'ordering': ['-crawled_at', '-confidence_score'],
},
),
migrations.CreateModel(
name='AutoGeneratedReport',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('title', models.CharField(help_text='Auto-generated report title', max_length=500)),
('description', models.TextField(help_text='Auto-generated report description')),
('source_url', models.URLField(help_text='Source URL where scam was found', max_length=1000)),
('confidence_score', models.IntegerField(default=0, help_text='Confidence score (0-100)')),
('status', models.CharField(choices=[('pending', 'Pending Review'), ('approved', 'Approved'), ('rejected', 'Rejected'), ('published', 'Published')], default='pending', help_text='Review status', max_length=20)),
('review_notes', models.TextField(blank=True, help_text='Notes from moderator/admin review')),
('reviewed_at', models.DateTimeField(blank=True, null=True)),
('published_at', models.DateTimeField(blank=True, null=True)),
('created_at', models.DateTimeField(auto_now_add=True)),
('updated_at', models.DateTimeField(auto_now=True)),
('report', models.ForeignKey(blank=True, help_text='Linked scam report (created when approved)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='auto_generated_reports', to='reports.scamreport')),
('reviewed_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='reviewed_auto_reports', to=settings.AUTH_USER_MODEL)),
('crawled_content', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='auto_report', to='osint.crawledcontent')),
('matched_keywords', models.ManyToManyField(related_name='generated_reports', to='osint.osintkeyword')),
],
options={
'verbose_name': 'Auto-Generated Report',
'verbose_name_plural': 'Auto-Generated Reports',
'db_table': 'osint_autogeneratedreport',
'ordering': ['-created_at', '-confidence_score'],
},
),
migrations.CreateModel(
name='SeedWebsite',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.URLField(help_text='Base URL to crawl', max_length=500)),
('name', models.CharField(help_text='Friendly name for this seed website', max_length=200)),
('description', models.TextField(blank=True, help_text='Description of the website')),
('is_active', models.BooleanField(default=True, help_text='Enable/disable crawling for this website')),
('priority', models.CharField(choices=[('high', 'High'), ('medium', 'Medium'), ('low', 'Low')], default='medium', help_text='Crawling priority', max_length=10)),
('crawl_depth', models.IntegerField(default=2, help_text='Maximum depth to crawl (0 = only this page, 1 = this page + direct links, etc.)')),
('crawl_interval_hours', models.IntegerField(default=24, help_text='Hours between crawls')),
('allowed_domains', models.JSONField(blank=True, default=list, help_text='List of allowed domains to crawl (empty = same domain only)')),
('user_agent', models.CharField(blank=True, default='Mozilla/5.0 (compatible; OSINTBot/1.0)', help_text='User agent string for requests', max_length=255)),
('last_crawled_at', models.DateTimeField(blank=True, null=True)),
('pages_crawled', models.IntegerField(default=0)),
('matches_found', models.IntegerField(default=0)),
('created_at', models.DateTimeField(auto_now_add=True)),
('updated_at', models.DateTimeField(auto_now=True)),
('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_seed_websites', to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'Seed Website',
'verbose_name_plural': 'Seed Websites',
'db_table': 'osint_seedwebsite',
'ordering': ['-priority', '-last_crawled_at'],
},
),
migrations.AddField(
model_name='crawledcontent',
name='seed_website',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='crawled_contents', to='osint.seedwebsite'),
),
migrations.AddIndex(
model_name='osintkeyword',
index=models.Index(fields=['is_active', 'keyword_type'], name='osint_keywo_is_acti_6f4814_idx'),
),
migrations.AddIndex(
model_name='autogeneratedreport',
index=models.Index(fields=['status', 'confidence_score'], name='osint_autog_status_a8a215_idx'),
),
migrations.AddIndex(
model_name='autogeneratedreport',
index=models.Index(fields=['created_at'], name='osint_autog_created_07e2b0_idx'),
),
migrations.AddIndex(
model_name='seedwebsite',
index=models.Index(fields=['is_active', 'priority'], name='osint_seedw_is_acti_411fa2_idx'),
),
migrations.AddIndex(
model_name='seedwebsite',
index=models.Index(fields=['last_crawled_at'], name='osint_seedw_last_cr_673111_idx'),
),
migrations.AddIndex(
model_name='crawledcontent',
index=models.Index(fields=['seed_website', 'crawled_at'], name='osint_crawl_seed_we_eb78f4_idx'),
),
migrations.AddIndex(
model_name='crawledcontent',
index=models.Index(fields=['has_potential_scam', 'confidence_score'], name='osint_crawl_has_pot_9317d0_idx'),
),
migrations.AddIndex(
model_name='crawledcontent',
index=models.Index(fields=['content_hash'], name='osint_crawl_content_17d05a_idx'),
),
migrations.AlterUniqueTogether(
name='crawledcontent',
unique_together={('url', 'content_hash')},
),
]

0
osint/migrations/__init__.py Normal file

468
osint/models.py Normal file

@@ -0,0 +1,468 @@
"""
OSINT (Open Source Intelligence) integration models.
"""
from django.db import models
from django.contrib.auth import get_user_model
from reports.models import ScamReport
User = get_user_model()
class OSINTTask(models.Model):
"""
Background tasks for OSINT data collection.
"""
TASK_TYPE_CHOICES = [
('domain_analysis', 'Domain Analysis'),
('url_analysis', 'URL Analysis'),
('email_analysis', 'Email Analysis'),
('phone_analysis', 'Phone Analysis'),
('whois_lookup', 'WHOIS Lookup'),
('dns_lookup', 'DNS Lookup'),
('ssl_check', 'SSL Certificate Check'),
('archive_check', 'Archive Check'),
('business_registry', 'Business Registry Check'),
('social_media', 'Social Media Check'),
]
STATUS_CHOICES = [
('pending', 'Pending'),
('running', 'Running'),
('completed', 'Completed'),
('failed', 'Failed'),
('cancelled', 'Cancelled'),
]
report = models.ForeignKey(
ScamReport,
on_delete=models.CASCADE,
related_name='osint_tasks'
)
task_type = models.CharField(
max_length=50,
choices=TASK_TYPE_CHOICES
)
status = models.CharField(
max_length=20,
choices=STATUS_CHOICES,
default='pending'
)
parameters = models.JSONField(
default=dict,
help_text='Task parameters (e.g., URL, email, phone)'
)
result = models.JSONField(
default=dict,
blank=True,
help_text='Task result data'
)
error_message = models.TextField(blank=True)
created_at = models.DateTimeField(auto_now_add=True)
started_at = models.DateTimeField(null=True, blank=True)
completed_at = models.DateTimeField(null=True, blank=True)
retry_count = models.IntegerField(default=0)
class Meta:
db_table = 'osint_osinttask'
verbose_name = 'OSINT Task'
verbose_name_plural = 'OSINT Tasks'
ordering = ['-created_at']
indexes = [
models.Index(fields=['status', 'created_at']),
models.Index(fields=['report', 'task_type']),
]
def __str__(self):
return f"{self.get_task_type_display()} for Report #{self.report.id} - {self.get_status_display()}"
class OSINTResult(models.Model):
"""
OSINT investigation results.
"""
DATA_TYPE_CHOICES = [
('whois', 'WHOIS Data'),
('dns', 'DNS Records'),
('ssl', 'SSL Certificate'),
('archive', 'Archive Data'),
('email', 'Email Data'),
('phone', 'Phone Data'),
('business', 'Business Registry Data'),
('social', 'Social Media Data'),
('reputation', 'Reputation Data'),
]
report = models.ForeignKey(
ScamReport,
on_delete=models.CASCADE,
related_name='osint_results'
)
source = models.CharField(
max_length=100,
help_text='OSINT source/service name'
)
data_type = models.CharField(
max_length=50,
choices=DATA_TYPE_CHOICES
)
raw_data = models.JSONField(
default=dict,
help_text='Raw data from OSINT source'
)
processed_data = models.JSONField(
default=dict,
blank=True,
help_text='Processed/cleaned data'
)
confidence_level = models.IntegerField(
default=0,
help_text='Confidence level (0-100)'
)
is_verified = models.BooleanField(
default=False,
help_text='Manually verified by moderator'
)
collected_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
class Meta:
db_table = 'osint_osintresult'
verbose_name = 'OSINT Result'
verbose_name_plural = 'OSINT Results'
ordering = ['-collected_at']
indexes = [
models.Index(fields=['report', 'data_type']),
models.Index(fields=['confidence_level', 'is_verified']),
]
def __str__(self):
return f"{self.get_data_type_display()} from {self.source} for Report #{self.report.id}"
class OSINTConfiguration(models.Model):
"""
Configuration for OSINT services and APIs.
"""
service_name = models.CharField(max_length=100, unique=True)
api_key = models.CharField(
max_length=255,
blank=True,
help_text='Encrypted API key'
)
api_url = models.URLField(blank=True)
is_active = models.BooleanField(default=True)
rate_limit = models.IntegerField(
default=100,
help_text='Requests per hour'
)
configuration = models.JSONField(
default=dict,
blank=True,
help_text='Additional configuration'
)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
class Meta:
db_table = 'osint_osintconfiguration'
verbose_name = 'OSINT Configuration'
verbose_name_plural = 'OSINT Configurations'
def __str__(self):
return f"{self.service_name} ({'Active' if self.is_active else 'Inactive'})"
class SeedWebsite(models.Model):
"""
Seed websites for OSINT crawling.
"""
PRIORITY_CHOICES = [
('high', 'High'),
('medium', 'Medium'),
('low', 'Low'),
]
url = models.URLField(
max_length=500,
help_text='Base URL to crawl'
)
name = models.CharField(
max_length=200,
help_text='Friendly name for this seed website'
)
description = models.TextField(
blank=True,
help_text='Description of the website'
)
is_active = models.BooleanField(
default=True,
help_text='Enable/disable crawling for this website'
)
priority = models.CharField(
max_length=10,
choices=PRIORITY_CHOICES,
default='medium',
help_text='Crawling priority'
)
crawl_depth = models.IntegerField(
default=2,
help_text='Maximum depth to crawl (0 = only this page, 1 = this page + direct links, etc.)'
)
crawl_interval_hours = models.IntegerField(
default=24,
help_text='Hours between crawls'
)
allowed_domains = models.JSONField(
default=list,
blank=True,
help_text='List of allowed domains to crawl (empty = same domain only)'
)
user_agent = models.CharField(
max_length=255,
blank=True,
default='Mozilla/5.0 (compatible; OSINTBot/1.0)',
help_text='User agent string for requests'
)
last_crawled_at = models.DateTimeField(null=True, blank=True)
pages_crawled = models.IntegerField(default=0)
matches_found = models.IntegerField(default=0)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
created_by = models.ForeignKey(
User,
on_delete=models.SET_NULL,
null=True,
related_name='created_seed_websites'
)
class Meta:
db_table = 'osint_seedwebsite'
verbose_name = 'Seed Website'
verbose_name_plural = 'Seed Websites'
ordering = ['-priority', '-last_crawled_at']
indexes = [
models.Index(fields=['is_active', 'priority']),
models.Index(fields=['last_crawled_at']),
]
def __str__(self):
return f"{self.name} ({self.url})"
class OSINTKeyword(models.Model):
"""
Keywords and patterns to search for during OSINT crawling.
"""
TYPE_CHOICES = [
('exact', 'Exact Match'),
('regex', 'Regular Expression'),
('phrase', 'Phrase Match'),
('domain', 'Domain Pattern'),
('email', 'Email Pattern'),
('phone', 'Phone Pattern'),
]
keyword = models.CharField(
max_length=500,
help_text='Keyword, phrase, or regex pattern to search for'
)
name = models.CharField(
max_length=200,
help_text='Friendly name for this keyword'
)
description = models.TextField(
blank=True,
help_text='Description of what this keyword detects'
)
keyword_type = models.CharField(
max_length=20,
choices=TYPE_CHOICES,
default='phrase',
help_text='Type of matching to perform'
)
is_active = models.BooleanField(
default=True,
help_text='Enable/disable this keyword'
)
case_sensitive = models.BooleanField(
default=False,
help_text='Case sensitive matching'
)
confidence_score = models.IntegerField(
default=50,
help_text='Default confidence score (0-100) when this keyword matches'
)
auto_approve = models.BooleanField(
default=False,
help_text='Auto-approve reports matching this keyword (requires high confidence)'
)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
created_by = models.ForeignKey(
User,
on_delete=models.SET_NULL,
null=True,
related_name='created_keywords'
)
class Meta:
db_table = 'osint_keyword'
verbose_name = 'OSINT Keyword'
verbose_name_plural = 'OSINT Keywords'
ordering = ['-is_active', 'name']
indexes = [
models.Index(fields=['is_active', 'keyword_type']),
]
def __str__(self):
return f"{self.name} ({self.keyword_type})"
class CrawledContent(models.Model):
"""
Content crawled from seed websites.
"""
seed_website = models.ForeignKey(
SeedWebsite,
on_delete=models.CASCADE,
related_name='crawled_contents'
)
url = models.URLField(
max_length=1000,
help_text='URL of the crawled page'
)
title = models.CharField(
max_length=500,
blank=True,
help_text='Page title'
)
content = models.TextField(
help_text='Crawled page content'
)
html_content = models.TextField(
blank=True,
help_text='Raw HTML content'
)
matched_keywords = models.ManyToManyField(
OSINTKeyword,
blank=True,
related_name='matched_contents',
help_text='Keywords that matched this content'
)
match_count = models.IntegerField(
default=0,
help_text='Number of keyword matches found'
)
confidence_score = models.IntegerField(
default=0,
help_text='Calculated confidence score based on matches'
)
has_potential_scam = models.BooleanField(
default=False,
help_text='Flagged as potential scam based on keyword matches'
)
crawled_at = models.DateTimeField(auto_now_add=True)
http_status = models.IntegerField(
null=True,
blank=True,
help_text='HTTP status code'
)
content_hash = models.CharField(
max_length=64,
blank=True,
help_text='SHA256 hash of content for deduplication'
)
class Meta:
db_table = 'osint_crawledcontent'
verbose_name = 'Crawled Content'
verbose_name_plural = 'Crawled Contents'
ordering = ['-crawled_at', '-confidence_score']
indexes = [
models.Index(fields=['seed_website', 'crawled_at']),
models.Index(fields=['has_potential_scam', 'confidence_score']),
models.Index(fields=['content_hash']),
]
unique_together = [['url', 'content_hash']]
def __str__(self):
return f"{self.title or self.url} - {self.match_count} matches"
class AutoGeneratedReport(models.Model):
"""
Automatically generated scam reports from OSINT crawling.
"""
STATUS_CHOICES = [
('pending', 'Pending Review'),
('approved', 'Approved'),
('rejected', 'Rejected'),
('published', 'Published'),
]
crawled_content = models.OneToOneField(
CrawledContent,
on_delete=models.CASCADE,
related_name='auto_report'
)
report = models.ForeignKey(
ScamReport,
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='auto_generated_reports',
help_text='Linked scam report (created when approved)'
)
title = models.CharField(
max_length=500,
help_text='Auto-generated report title'
)
description = models.TextField(
help_text='Auto-generated report description'
)
source_url = models.URLField(
max_length=1000,
help_text='Source URL where scam was found'
)
matched_keywords = models.ManyToManyField(
OSINTKeyword,
related_name='generated_reports'
)
confidence_score = models.IntegerField(
default=0,
help_text='Confidence score (0-100)'
)
status = models.CharField(
max_length=20,
choices=STATUS_CHOICES,
default='pending',
help_text='Review status'
)
review_notes = models.TextField(
blank=True,
help_text='Notes from moderator/admin review'
)
reviewed_by = models.ForeignKey(
User,
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='reviewed_auto_reports'
)
reviewed_at = models.DateTimeField(null=True, blank=True)
published_at = models.DateTimeField(null=True, blank=True)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
class Meta:
db_table = 'osint_autogeneratedreport'
verbose_name = 'Auto-Generated Report'
verbose_name_plural = 'Auto-Generated Reports'
ordering = ['-created_at', '-confidence_score']
indexes = [
models.Index(fields=['status', 'confidence_score']),
models.Index(fields=['created_at']),
]
def __str__(self):
return f"{self.title} - {self.get_status_display()}"

75
osint/tasks.py Normal file

@@ -0,0 +1,75 @@
"""
Celery tasks for OSINT crawling.
"""
from celery import shared_task
from django.core.management import call_command
from django.utils import timezone
from .models import AutoGeneratedReport
@shared_task
def crawl_osint_seeds():
"""
Periodic task to crawl all due seed websites.
This should be scheduled to run periodically (e.g., every hour).
"""
try:
call_command('crawl_osint', '--all', verbosity=0)
return "OSINT crawling completed successfully"
except Exception as e:
return f"OSINT crawling failed: {str(e)}"
@shared_task
def crawl_specific_seed(seed_id):
"""
Crawl a specific seed website.
"""
try:
call_command('crawl_osint', '--seed-id', str(seed_id), verbosity=0)
return f"Seed website {seed_id} crawled successfully"
except Exception as e:
return f"Seed website {seed_id} crawling failed: {str(e)}"
@shared_task
def auto_approve_high_confidence_reports():
"""
Auto-approve reports with very high confidence scores and auto-approve keywords.
"""
from reports.models import ScamReport
# Get auto-reports that should be auto-approved
auto_reports = AutoGeneratedReport.objects.filter(
status='pending',
confidence_score__gte=80
).prefetch_related('matched_keywords')
approved_count = 0
for auto_report in auto_reports:
# Check if any matched keyword has auto_approve enabled
if any(kw.auto_approve for kw in auto_report.matched_keywords.all()):
# Approve and publish: create the report directly
report = ScamReport.objects.create(
title=auto_report.title,
description=auto_report.description,
reported_url=auto_report.source_url,
scam_type='other',
status='verified',
verification_score=auto_report.confidence_score,
is_public=True,
is_anonymous=True,
is_auto_discovered=True, # Mark as auto-discovered
)
auto_report.report = report
auto_report.status = 'published'
auto_report.published_at = timezone.now()
auto_report.save()
approved_count += 1
return f"Auto-approved {approved_count} reports"

3
osint/tests.py Normal file

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

35
osint/urls.py Normal file

@@ -0,0 +1,35 @@
"""
URL configuration for osint app.
"""
from django.urls import path
from . import views
app_name = 'osint'
urlpatterns = [
# Admin Dashboard (Main OSINT Management)
path('admin-dashboard/', views.OSINTAdminDashboardView.as_view(), name='admin_dashboard'),
# Seed Website Management
path('admin-dashboard/seeds/add/', views.SeedWebsiteCreateView.as_view(), name='seed_create'),
path('admin-dashboard/seeds/<int:pk>/edit/', views.SeedWebsiteUpdateView.as_view(), name='seed_edit'),
path('admin-dashboard/seeds/<int:pk>/delete/', views.SeedWebsiteDeleteView.as_view(), name='seed_delete'),
# Keyword Management
path('admin-dashboard/keywords/add/', views.OSINTKeywordCreateView.as_view(), name='keyword_create'),
path('admin-dashboard/keywords/<int:pk>/edit/', views.OSINTKeywordUpdateView.as_view(), name='keyword_edit'),
path('admin-dashboard/keywords/<int:pk>/delete/', views.OSINTKeywordDeleteView.as_view(), name='keyword_delete'),
# Crawling Control
path('admin-dashboard/start-crawling/', views.StartCrawlingView.as_view(), name='start_crawling'),
# Legacy/Moderator Views
path('tasks/', views.OSINTTaskListView.as_view(), name='task_list'),
path('tasks/<int:pk>/', views.OSINTTaskDetailView.as_view(), name='task_detail'),
path('results/<int:report_id>/', views.OSINTResultListView.as_view(), name='result_list'),
path('auto-reports/', views.AutoReportListView.as_view(), name='auto_report_list'),
path('auto-reports/<int:pk>/', views.AutoReportDetailView.as_view(), name='auto_report_detail'),
path('auto-reports/<int:pk>/approve/', views.ApproveAutoReportView.as_view(), name='approve_auto_report'),
path('auto-reports/<int:pk>/reject/', views.RejectAutoReportView.as_view(), name='reject_auto_report'),
]
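Because of app_name = 'osint', the routes reverse with the namespace; the final prefix depends on how the project urls.py includes this module (pk values are illustrative):

from django.urls import reverse

reverse('osint:admin_dashboard')              # .../admin-dashboard/
reverse('osint:seed_edit', kwargs={'pk': 1})  # .../admin-dashboard/seeds/1/edit/
reverse('osint:approve_auto_report', kwargs={'pk': 1})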

346
osint/views.py Normal file

@@ -0,0 +1,346 @@
"""
Views for osint app.
"""
from django.shortcuts import get_object_or_404, redirect
from django.views.generic import ListView, DetailView, UpdateView, TemplateView, CreateView, DeleteView
from django.contrib.auth.mixins import LoginRequiredMixin, UserPassesTestMixin
from django.contrib.messages.views import SuccessMessageMixin
from django.contrib import messages
from django.urls import reverse_lazy
from django.utils import timezone
from django.db import transaction
from django.db.models import Sum
from django.core.management import call_command
import threading
from reports.models import ScamReport
from .models import OSINTTask, OSINTResult, AutoGeneratedReport, SeedWebsite, OSINTKeyword, CrawledContent
from .forms import SeedWebsiteForm, OSINTKeywordForm
class ModeratorRequiredMixin(UserPassesTestMixin):
"""Mixin to require moderator role."""
def test_func(self):
return self.request.user.is_authenticated and self.request.user.is_moderator()
class OSINTTaskListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
"""List OSINT tasks."""
model = OSINTTask
template_name = 'osint/task_list.html'
context_object_name = 'tasks'
paginate_by = 50
def get_queryset(self):
status = self.request.GET.get('status', '')
queryset = OSINTTask.objects.select_related('report')
if status:
queryset = queryset.filter(status=status)
return queryset.order_by('-created_at')
class OSINTTaskDetailView(LoginRequiredMixin, ModeratorRequiredMixin, DetailView):
"""View OSINT task details."""
model = OSINTTask
template_name = 'osint/task_detail.html'
context_object_name = 'task'
class OSINTResultListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
"""List OSINT results for a report."""
model = OSINTResult
template_name = 'osint/result_list.html'
context_object_name = 'results'
def get_queryset(self):
report = get_object_or_404(ScamReport, pk=self.kwargs['report_id'])
return OSINTResult.objects.filter(report=report).order_by('-collected_at')
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['report'] = get_object_or_404(ScamReport, pk=self.kwargs['report_id'])
return context
class AutoReportListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
"""List auto-generated reports for review."""
model = AutoGeneratedReport
template_name = 'osint/auto_report_list.html'
context_object_name = 'auto_reports'
paginate_by = 20
def get_queryset(self):
status = self.request.GET.get('status', 'pending')
queryset = AutoGeneratedReport.objects.select_related(
'crawled_content', 'reviewed_by', 'report'
).prefetch_related('matched_keywords')
if status:
queryset = queryset.filter(status=status)
return queryset.order_by('-confidence_score', '-created_at')
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['pending_count'] = AutoGeneratedReport.objects.filter(status='pending').count()
context['approved_count'] = AutoGeneratedReport.objects.filter(status='approved').count()
context['published_count'] = AutoGeneratedReport.objects.filter(status='published').count()
context['rejected_count'] = AutoGeneratedReport.objects.filter(status='rejected').count()
return context
class AutoReportDetailView(LoginRequiredMixin, ModeratorRequiredMixin, DetailView):
"""View auto-generated report details."""
model = AutoGeneratedReport
template_name = 'osint/auto_report_detail.html'
context_object_name = 'auto_report'
def get_queryset(self):
return AutoGeneratedReport.objects.select_related(
'crawled_content', 'crawled_content__seed_website',
'reviewed_by', 'report'
).prefetch_related('matched_keywords')
class ApproveAutoReportView(LoginRequiredMixin, ModeratorRequiredMixin, SuccessMessageMixin, UpdateView):
"""Approve an auto-generated report."""
model = AutoGeneratedReport
fields = []
template_name = 'osint/approve_auto_report.html'
success_message = "Auto-generated report approved successfully!"
def form_valid(self, form):
auto_report = form.instance
with transaction.atomic():
# Update auto report
auto_report.status = 'approved'
auto_report.reviewed_by = self.request.user
auto_report.reviewed_at = timezone.now()
auto_report.save()
# Create the actual scam report (ScamReport is imported at module level)
report = ScamReport.objects.create(
title=auto_report.title,
description=auto_report.description,
reported_url=auto_report.source_url,
scam_type='other', # Default, can be updated
status='verified',
verification_score=auto_report.confidence_score,
is_public=True,
is_anonymous=True, # System-generated
is_auto_discovered=True, # Mark as auto-discovered
)
auto_report.report = report
auto_report.status = 'published'
auto_report.published_at = timezone.now()
auto_report.save()
return super().form_valid(form)
def get_success_url(self):
return reverse_lazy('osint:auto_report_list')
class RejectAutoReportView(LoginRequiredMixin, ModeratorRequiredMixin, SuccessMessageMixin, UpdateView):
"""Reject an auto-generated report."""
model = AutoGeneratedReport
fields = []
template_name = 'osint/reject_auto_report.html'
success_message = "Auto-generated report rejected."
def form_valid(self, form):
auto_report = form.instance
auto_report.status = 'rejected'
auto_report.reviewed_by = self.request.user
auto_report.reviewed_at = timezone.now()
auto_report.review_notes = self.request.POST.get('review_notes', '').strip()
auto_report.save()
return super().form_valid(form)
def get_success_url(self):
return reverse_lazy('osint:auto_report_list')
class AdminRequiredMixin(UserPassesTestMixin):
"""Mixin to require admin role."""
def test_func(self):
return self.request.user.is_authenticated and self.request.user.is_administrator()
class OSINTAdminDashboardView(LoginRequiredMixin, AdminRequiredMixin, TemplateView):
"""Comprehensive OSINT admin dashboard."""
template_name = 'osint/admin_dashboard.html'
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
now = timezone.now()
# Seed Website Statistics
context['total_seeds'] = SeedWebsite.objects.count()
context['active_seeds'] = SeedWebsite.objects.filter(is_active=True).count()
context['seed_websites'] = SeedWebsite.objects.all().order_by('-priority', '-last_crawled_at')[:10]
# Keyword Statistics
context['total_keywords'] = OSINTKeyword.objects.count()
context['active_keywords'] = OSINTKeyword.objects.filter(is_active=True).count()
context['keywords'] = OSINTKeyword.objects.all().order_by('-is_active', 'name')[:10]
# Crawling Statistics
context['total_crawled'] = CrawledContent.objects.count()
context['potential_scams'] = CrawledContent.objects.filter(has_potential_scam=True).count()
context['recent_crawled'] = CrawledContent.objects.order_by('-crawled_at')[:5]
# Auto-Report Statistics
context['pending_reports'] = AutoGeneratedReport.objects.filter(status='pending').count()
context['approved_reports'] = AutoGeneratedReport.objects.filter(status='approved').count()
context['published_reports'] = AutoGeneratedReport.objects.filter(status='published').count()
context['rejected_reports'] = AutoGeneratedReport.objects.filter(status='rejected').count()
context['recent_auto_reports'] = AutoGeneratedReport.objects.order_by('-created_at')[:5]
# Overall Statistics (Sum, not Count: we want totals of the per-seed counters)
context['total_pages_crawled'] = SeedWebsite.objects.aggregate(
total=Sum('pages_crawled')
)['total'] or 0
context['total_matches'] = SeedWebsite.objects.aggregate(
total=Sum('matches_found')
)['total'] or 0
# Seed websites due for crawling
due_seeds = []
for seed in SeedWebsite.objects.filter(is_active=True):
if not seed.last_crawled_at:
due_seeds.append(seed)
else:
hours_since = (now - seed.last_crawled_at).total_seconds() / 3600
if hours_since >= seed.crawl_interval_hours:
due_seeds.append(seed)
context['due_for_crawling'] = due_seeds[:5]
return context
class SeedWebsiteCreateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, CreateView):
"""Create a new seed website."""
model = SeedWebsite
form_class = SeedWebsiteForm
template_name = 'osint/seed_website_form.html'
success_message = "Seed website created successfully!"
def form_valid(self, form):
form.instance.created_by = self.request.user
return super().form_valid(form)
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class SeedWebsiteUpdateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, UpdateView):
"""Update a seed website."""
model = SeedWebsite
form_class = SeedWebsiteForm
template_name = 'osint/seed_website_form.html'
success_message = "Seed website updated successfully!"
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class SeedWebsiteDeleteView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, DeleteView):
"""Delete a seed website."""
model = SeedWebsite
template_name = 'osint/seed_website_confirm_delete.html'
success_message = "Seed website deleted successfully!"
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class OSINTKeywordCreateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, CreateView):
"""Create a new OSINT keyword."""
model = OSINTKeyword
form_class = OSINTKeywordForm
template_name = 'osint/keyword_form.html'
success_message = "Keyword created successfully!"
def form_valid(self, form):
form.instance.created_by = self.request.user
return super().form_valid(form)
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class OSINTKeywordUpdateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, UpdateView):
"""Update an OSINT keyword."""
model = OSINTKeyword
form_class = OSINTKeywordForm
template_name = 'osint/keyword_form.html'
success_message = "Keyword updated successfully!"
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class OSINTKeywordDeleteView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, DeleteView):
"""Delete an OSINT keyword."""
model = OSINTKeyword
template_name = 'osint/keyword_confirm_delete.html'
success_message = "Keyword deleted successfully!"
def get_success_url(self):
return reverse_lazy('osint:admin_dashboard')
class StartCrawlingView(LoginRequiredMixin, AdminRequiredMixin, TemplateView):
"""Start OSINT crawling."""
template_name = 'osint/start_crawling.html'
def post(self, request, *args, **kwargs):
seed_id = request.POST.get('seed_id')
max_pages = request.POST.get('max_pages', 50)
delay = request.POST.get('delay', 1.0)
def run_crawl():
import sys
from django.db import connections
# The app registry is already initialised in this process, so no
# django.setup() is needed; the thread only has to close its own
# DB connections when done.
try:
if seed_id:
call_command('crawl_osint', '--seed-id', str(seed_id),
'--max-pages', str(max_pages), '--delay', str(delay), verbosity=1)
else:
call_command('crawl_osint', '--all',
'--max-pages', str(max_pages), '--delay', str(delay), verbosity=1)
except Exception as e:
# Log error to a file or database for debugging
import traceback
error_msg = f"Crawling error: {str(e)}\n{traceback.format_exc()}"
print(error_msg, file=sys.stderr)
# You could also log to a file or database here
finally:
# Close database connections
connections.close_all()
# Run in background thread
thread = threading.Thread(target=run_crawl)
thread.daemon = True
thread.start()
messages.success(request, f'Crawling started in background. Check results in a few minutes. (Max pages: {max_pages}, Delay: {delay}s)')
return redirect('osint:admin_dashboard')
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['seed_websites'] = SeedWebsite.objects.filter(is_active=True)
return context
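StartCrawlingView runs the command on an in-process daemon thread, which dies with the web worker and leaves no durable record. Since osint/tasks.py already defines Celery tasks for the same command, the view could delegate instead; a sketch (the existing tasks do not accept max_pages or delay, so those options would need to be threaded through):

from osint.tasks import crawl_osint_seeds, crawl_specific_seed

def start_crawl(seed_id=None):
    """Queue crawling on a Celery worker instead of a local thread."""
    if seed_id:
        crawl_specific_seed.delay(seed_id)
    else:
        crawl_osint_seeds.delay()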