update
0  osint/__init__.py  Normal file
246  osint/admin.py  Normal file
@@ -0,0 +1,246 @@
"""
Admin configuration for osint app.
"""
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils import timezone

from .models import (
    OSINTTask, OSINTResult, OSINTConfiguration,
    SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
)


@admin.register(OSINTTask)
class OSINTTaskAdmin(admin.ModelAdmin):
    """OSINT task admin."""
    list_display = ('report', 'task_type', 'status', 'created_at', 'completed_at')
    list_filter = ('task_type', 'status', 'created_at')
    search_fields = ('report__title', 'error_message')
    readonly_fields = ('created_at', 'started_at', 'completed_at')
    date_hierarchy = 'created_at'


@admin.register(OSINTResult)
class OSINTResultAdmin(admin.ModelAdmin):
    """OSINT result admin."""
    list_display = ('report', 'source', 'data_type', 'confidence_level', 'is_verified', 'collected_at')
    list_filter = ('data_type', 'is_verified', 'collected_at')
    search_fields = ('report__title', 'source')
    readonly_fields = ('collected_at', 'updated_at')
    date_hierarchy = 'collected_at'


@admin.register(OSINTConfiguration)
class OSINTConfigurationAdmin(admin.ModelAdmin):
    """OSINT configuration admin."""
    list_display = ('service_name', 'is_active', 'rate_limit', 'updated_at')
    list_filter = ('is_active',)
    search_fields = ('service_name',)


@admin.register(SeedWebsite)
class SeedWebsiteAdmin(admin.ModelAdmin):
    """Seed website admin."""
    list_display = ('name', 'url', 'is_active', 'priority', 'last_crawled_at',
                    'pages_crawled', 'matches_found', 'status_indicator')
    list_filter = ('is_active', 'priority', 'created_at')
    search_fields = ('name', 'url', 'description')
    readonly_fields = ('last_crawled_at', 'pages_crawled', 'matches_found', 'created_at', 'updated_at')
    fieldsets = (
        ('Basic Information', {
            'fields': ('name', 'url', 'description', 'is_active', 'priority', 'created_by')
        }),
        ('Crawling Configuration', {
            'fields': ('crawl_depth', 'crawl_interval_hours', 'allowed_domains', 'user_agent')
        }),
        ('Statistics', {
            'fields': ('last_crawled_at', 'pages_crawled', 'matches_found'),
            'classes': ('collapse',)
        }),
        ('Timestamps', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )
    date_hierarchy = 'created_at'

    def status_indicator(self, obj):
        """Show a visual crawl-status indicator."""
        if not obj.is_active:
            return format_html('<span style="color: red;">●</span> Inactive')
        if not obj.last_crawled_at:
            return format_html('<span style="color: orange;">●</span> Never Crawled')

        hours_since = (timezone.now() - obj.last_crawled_at).total_seconds() / 3600
        if hours_since > obj.crawl_interval_hours * 2:
            return format_html('<span style="color: orange;">●</span> Overdue')
        elif hours_since > obj.crawl_interval_hours:
            return format_html('<span style="color: yellow;">●</span> Due Soon')
        else:
            return format_html('<span style="color: green;">●</span> Up to Date')
    status_indicator.short_description = 'Status'

    def save_model(self, request, obj, form, change):
        if not change:  # New object
            obj.created_by = request.user
        super().save_model(request, obj, form, change)


@admin.register(OSINTKeyword)
class OSINTKeywordAdmin(admin.ModelAdmin):
    """OSINT keyword admin."""
    list_display = ('name', 'keyword', 'keyword_type', 'is_active', 'confidence_score', 'auto_approve', 'match_count')
    list_filter = ('is_active', 'keyword_type', 'auto_approve', 'created_at')
    search_fields = ('name', 'keyword', 'description')
    readonly_fields = ('created_at', 'updated_at', 'match_count')
    fieldsets = (
        ('Basic Information', {
            'fields': ('name', 'keyword', 'description', 'keyword_type', 'is_active', 'created_by')
        }),
        ('Matching Configuration', {
            'fields': ('case_sensitive', 'confidence_score', 'auto_approve')
        }),
        ('Statistics', {
            'fields': ('match_count',),
            'classes': ('collapse',)
        }),
        ('Timestamps', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )
    date_hierarchy = 'created_at'

    def match_count(self, obj):
        """Count how many times this keyword has matched."""
        return obj.matched_contents.count()
    match_count.short_description = 'Total Matches'

    def save_model(self, request, obj, form, change):
        if not change:  # New object
            obj.created_by = request.user
        super().save_model(request, obj, form, change)


@admin.register(CrawledContent)
class CrawledContentAdmin(admin.ModelAdmin):
    """Crawled content admin."""
    list_display = ('title', 'url', 'seed_website', 'match_count', 'confidence_score', 'has_potential_scam', 'crawled_at')
    list_filter = ('has_potential_scam', 'seed_website', 'crawled_at', 'http_status')
    search_fields = ('title', 'url', 'content')
    readonly_fields = ('crawled_at', 'content_hash', 'http_status')
    fieldsets = (
        ('Content Information', {
            'fields': ('seed_website', 'url', 'title', 'content', 'html_content')
        }),
        ('Analysis', {
            'fields': ('matched_keywords', 'match_count', 'confidence_score', 'has_potential_scam')
        }),
        ('Metadata', {
            'fields': ('http_status', 'content_hash', 'crawled_at'),
            'classes': ('collapse',)
        }),
    )
    date_hierarchy = 'crawled_at'
    filter_horizontal = ('matched_keywords',)

    def get_queryset(self, request):
        return super().get_queryset(request).select_related('seed_website').prefetch_related('matched_keywords')


@admin.register(AutoGeneratedReport)
class AutoGeneratedReportAdmin(admin.ModelAdmin):
    """Auto-generated report admin."""
    list_display = ('title', 'source_url', 'status', 'confidence_score', 'reviewed_by', 'reviewed_at', 'view_report_link')
    list_filter = ('status', 'confidence_score', 'created_at', 'reviewed_at')
    search_fields = ('title', 'description', 'source_url')
    readonly_fields = ('crawled_content', 'created_at', 'updated_at', 'published_at')
    fieldsets = (
        ('Report Information', {
            'fields': ('crawled_content', 'title', 'description', 'source_url')
        }),
        ('Analysis', {
            'fields': ('matched_keywords', 'confidence_score')
        }),
        ('Review', {
            'fields': ('status', 'review_notes', 'reviewed_by', 'reviewed_at', 'report')
        }),
        ('Publication', {
            'fields': ('published_at',),
            'classes': ('collapse',)
        }),
        ('Timestamps', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )
    date_hierarchy = 'created_at'
    filter_horizontal = ('matched_keywords',)
    actions = ['approve_reports', 'reject_reports', 'publish_reports']

    def view_report_link(self, obj):
        """Link to the generated report, if one exists."""
        if obj.report:
            url = reverse('admin:reports_scamreport_change', args=[obj.report.pk])
            return format_html('<a href="{}">View Report #{}</a>', url, obj.report.pk)
        return '-'
    view_report_link.short_description = 'Linked Report'

    def get_queryset(self, request):
        return super().get_queryset(request).select_related(
            'crawled_content', 'reviewed_by', 'report'
        ).prefetch_related('matched_keywords')

    @admin.action(description='Approve selected reports')
    def approve_reports(self, request, queryset):
        """Approve selected auto-generated reports."""
        updated = queryset.filter(status='pending').update(
            status='approved',
            reviewed_by=request.user,
            reviewed_at=timezone.now()
        )
        self.message_user(request, f'{updated} reports approved.')

    @admin.action(description='Reject selected reports')
    def reject_reports(self, request, queryset):
        """Reject selected auto-generated reports."""
        updated = queryset.filter(status='pending').update(
            status='rejected',
            reviewed_by=request.user,
            reviewed_at=timezone.now()
        )
        self.message_user(request, f'{updated} reports rejected.')

    @admin.action(description='Publish selected reports')
    def publish_reports(self, request, queryset):
        """Publish approved reports."""
        from reports.models import ScamReport

        published = 0
        for auto_report in queryset.filter(status='approved'):
            if not auto_report.report:
                # Create the actual scam report
                report = ScamReport.objects.create(
                    title=auto_report.title,
                    description=auto_report.description,
                    reported_url=auto_report.source_url,
                    scam_type='other',  # Default type
                    status='verified',  # Auto-verified since reviewed
                    verification_score=auto_report.confidence_score,
                    is_public=True,
                    is_anonymous=True,  # System-generated
                    is_auto_discovered=True,  # Mark as auto-discovered
                    reporter_ip=None,  # System-generated
                )
                auto_report.report = report
                auto_report.status = 'published'
                auto_report.published_at = timezone.now()
                auto_report.save()
                published += 1

        self.message_user(request, f'{published} reports published.')
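The status_indicator method above buckets active seeds by multiples of crawl_interval_hours. A standalone sketch of the same bucketing, with hypothetical values and assuming a configured Django environment (this snippet is not part of the commit):

from datetime import timedelta
from django.utils import timezone

def crawl_status(last_crawled_at, interval_hours):
    # Mirrors SeedWebsiteAdmin.status_indicator for an active seed
    if last_crawled_at is None:
        return 'Never Crawled'
    hours_since = (timezone.now() - last_crawled_at).total_seconds() / 3600
    if hours_since > interval_hours * 2:
        return 'Overdue'
    if hours_since > interval_hours:
        return 'Due Soon'
    return 'Up to Date'

# With a 24-hour interval: 10 h ago -> 'Up to Date', 30 h -> 'Due Soon', 60 h -> 'Overdue'
assert crawl_status(timezone.now() - timedelta(hours=30), 24) == 'Due Soon'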
6  osint/apps.py  Normal file
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class OsintConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'osint'
97  osint/forms.py  Normal file
@@ -0,0 +1,97 @@
"""
Forms for OSINT app.
"""
import json

from django import forms

from .models import SeedWebsite, OSINTKeyword


class SeedWebsiteForm(forms.ModelForm):
    """Form for creating/editing seed websites."""
    allowed_domains_text = forms.CharField(
        required=False,
        widget=forms.Textarea(attrs={
            'class': 'form-control',
            'rows': 3,
            'placeholder': 'Enter domains separated by commas or as a JSON array, '
                           'e.g. example.com, subdomain.example.com\n'
                           'Or: ["example.com", "subdomain.example.com"]'
        }),
        help_text='Enter domains separated by commas or as a JSON array. Leave empty for same domain only.'
    )

    class Meta:
        model = SeedWebsite
        fields = [
            'name', 'url', 'description', 'is_active', 'priority',
            'crawl_depth', 'crawl_interval_hours', 'user_agent'
        ]
        widgets = {
            'name': forms.TextInput(attrs={'class': 'form-control'}),
            'url': forms.URLInput(attrs={'class': 'form-control'}),
            'description': forms.Textarea(attrs={'class': 'form-control', 'rows': 3}),
            'is_active': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
            'priority': forms.Select(attrs={'class': 'form-control'}),
            'crawl_depth': forms.NumberInput(attrs={'class': 'form-control', 'min': 0, 'max': 5}),
            'crawl_interval_hours': forms.NumberInput(attrs={'class': 'form-control', 'min': 1}),
            'user_agent': forms.TextInput(attrs={'class': 'form-control'}),
        }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.instance and self.instance.pk and self.instance.allowed_domains:
            # Convert the stored list to a text representation for editing
            if isinstance(self.instance.allowed_domains, list):
                self.fields['allowed_domains_text'].initial = ', '.join(self.instance.allowed_domains)
            else:
                self.fields['allowed_domains_text'].initial = str(self.instance.allowed_domains)

    def clean_allowed_domains_text(self):
        text = self.cleaned_data.get('allowed_domains_text', '').strip()
        if not text:
            return []

        # Try to parse as JSON first
        try:
            domains = json.loads(text)
            if isinstance(domains, list):
                return [str(d).strip() for d in domains if d]
        except (json.JSONDecodeError, ValueError):
            pass

        # Otherwise, treat as comma-separated
        return [d.strip() for d in text.split(',') if d.strip()]

    def save(self, commit=True):
        instance = super().save(commit=False)
        instance.allowed_domains = self.cleaned_data.get('allowed_domains_text', [])
        if commit:
            instance.save()
        return instance


class OSINTKeywordForm(forms.ModelForm):
    """Form for creating/editing OSINT keywords."""

    class Meta:
        model = OSINTKeyword
        fields = [
            'name', 'keyword', 'description', 'keyword_type', 'is_active',
            'case_sensitive', 'confidence_score', 'auto_approve'
        ]
        widgets = {
            'name': forms.TextInput(attrs={'class': 'form-control'}),
            'keyword': forms.Textarea(attrs={'class': 'form-control', 'rows': 2}),
            'description': forms.Textarea(attrs={'class': 'form-control', 'rows': 2}),
            'keyword_type': forms.Select(attrs={'class': 'form-control'}),
            'is_active': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
            'case_sensitive': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
            'confidence_score': forms.NumberInput(attrs={
                'class': 'form-control',
                'min': 0,
                'max': 100,
                'step': 1
            }),
            'auto_approve': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
        }
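For illustration, a sketch (assuming a configured Django environment; all field values are hypothetical) showing that both accepted formats for allowed_domains_text normalise to the same list:

from osint.forms import SeedWebsiteForm

for raw in ('example.com, sub.example.com',
            '["example.com", "sub.example.com"]'):
    form = SeedWebsiteForm(data={
        'name': 'Example', 'url': 'https://example.com',
        'priority': 'medium', 'crawl_depth': 2,
        'crawl_interval_hours': 24, 'allowed_domains_text': raw,
    })
    assert form.is_valid(), form.errors
    assert form.cleaned_data['allowed_domains_text'] == ['example.com', 'sub.example.com']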
2  osint/management/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management package
2  osint/management/commands/__init__.py  Normal file
@@ -0,0 +1,2 @@
# Management commands package
360  osint/management/commands/crawl_osint.py  Normal file
@@ -0,0 +1,360 @@
"""
Management command for OSINT crawling from seed websites.
"""
import re
import hashlib
import time
from datetime import timedelta
from urllib.parse import urljoin, urlparse

from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db import transaction

import requests
from bs4 import BeautifulSoup

from osint.models import SeedWebsite, OSINTKeyword, CrawledContent, AutoGeneratedReport
from reports.models import ScamReport


class Command(BaseCommand):
    help = 'Crawl seed websites and search for scam-related keywords'

    def add_arguments(self, parser):
        parser.add_argument(
            '--seed-id',
            type=int,
            help='Crawl specific seed website by ID',
        )
        parser.add_argument(
            '--all',
            action='store_true',
            help='Crawl all active seed websites',
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force crawl even if recently crawled',
        )
        parser.add_argument(
            '--max-pages',
            type=int,
            default=50,
            help='Maximum pages to crawl per seed website (default: 50)',
        )
        parser.add_argument(
            '--delay',
            type=float,
            default=1.0,
            help='Delay between requests in seconds (default: 1.0)',
        )

    def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS('Starting OSINT crawling...'))

        # Get seed websites to crawl
        if options['seed_id']:
            seeds = SeedWebsite.objects.filter(id=options['seed_id'], is_active=True)
        elif options['all'] or options['force']:
            # --force skips the interval check below
            seeds = SeedWebsite.objects.filter(is_active=True)
        else:
            # Default: crawl websites that are due. The interval is stored
            # per row, so filter in Python; an F() expression cannot be
            # passed to timedelta(hours=...) inside a queryset filter.
            now = timezone.now()
            due_ids = [
                seed.id for seed in SeedWebsite.objects.filter(is_active=True)
                if seed.last_crawled_at is None
                or seed.last_crawled_at < now - timedelta(hours=seed.crawl_interval_hours)
            ]
            seeds = SeedWebsite.objects.filter(id__in=due_ids)

        if not seeds.exists():
            self.stdout.write(self.style.WARNING('No seed websites to crawl.'))
            return

        # Get active keywords
        keywords = OSINTKeyword.objects.filter(is_active=True)
        if not keywords.exists():
            self.stdout.write(self.style.WARNING('No active keywords configured.'))
            return

        self.stdout.write(f'Found {seeds.count()} seed website(s) to crawl')
        self.stdout.write(f'Found {keywords.count()} active keyword(s)')

        total_pages = 0
        total_matches = 0

        for seed in seeds:
            self.stdout.write(f'\nCrawling: {seed.name} ({seed.url})')
            pages, matches = self.crawl_seed(seed, keywords, options)
            total_pages += pages
            total_matches += matches

            # Update seed website stats
            seed.last_crawled_at = timezone.now()
            seed.pages_crawled += pages
            seed.matches_found += matches
            seed.save()

        self.stdout.write(self.style.SUCCESS(
            f'\nCrawling completed! Total pages: {total_pages}, Total matches: {total_matches}'
        ))

    def crawl_seed(self, seed, keywords, options):
        """Crawl a single seed website."""
        max_pages = options['max_pages']
        delay = options['delay']
        pages_crawled = 0
        matches_found = 0

        # Determine allowed domains from the seed's base URL
        parsed_base = urlparse(seed.url)
        allowed_domains = seed.allowed_domains if seed.allowed_domains else [parsed_base.netloc]

        # URLs to visit
        visited_urls = set()
        urls_to_visit = [(seed.url, 0)]  # (url, depth)

        session = requests.Session()
        session.headers.update({
            'User-Agent': seed.user_agent or 'Mozilla/5.0 (compatible; OSINTBot/1.0)'
        })

        while urls_to_visit and pages_crawled < max_pages:
            url, depth = urls_to_visit.pop(0)

            # Skip if already visited or too deep
            if url in visited_urls or depth > seed.crawl_depth:
                continue

            # Check domain
            parsed = urlparse(url)
            if parsed.netloc not in allowed_domains:
                continue

            visited_urls.add(url)

            try:
                # Fetch page
                self.stdout.write(f'  Fetching: {url} (depth: {depth})')
                response = session.get(url, timeout=10, allow_redirects=True)
                response.raise_for_status()

                # Parse content
                soup = BeautifulSoup(response.text, 'lxml')

                # Extract text content; remove script and style elements first
                for script in soup(["script", "style", "meta", "link"]):
                    script.decompose()

                text_content = soup.get_text(separator=' ', strip=True)
                title = (soup.title.string or '') if soup.title else ''
                html_content = str(soup)

                # Calculate content hash
                content_hash = hashlib.sha256(text_content.encode('utf-8')).hexdigest()

                # Check for duplicates
                if CrawledContent.objects.filter(url=url, content_hash=content_hash).exists():
                    self.stdout.write('  Skipping duplicate content')
                    continue

                # Match keywords
                matched_keywords = []
                match_count = 0

                for keyword_obj in keywords:
                    matches = self.match_keyword(keyword_obj, text_content, url, title)
                    if matches:
                        matched_keywords.append(keyword_obj)
                        match_count += len(matches)

                # Calculate confidence score
                confidence_score = self.calculate_confidence(matched_keywords, match_count)
                has_potential_scam = confidence_score >= 30  # Threshold

                # Save crawled content
                with transaction.atomic():
                    crawled_content = CrawledContent.objects.create(
                        seed_website=seed,
                        url=url,
                        title=title[:500],
                        content=text_content[:10000],  # Limit content size
                        html_content=html_content[:50000],  # Limit HTML size
                        match_count=match_count,
                        confidence_score=confidence_score,
                        has_potential_scam=has_potential_scam,
                        http_status=response.status_code,
                        content_hash=content_hash
                    )
                    crawled_content.matched_keywords.set(matched_keywords)

                pages_crawled += 1

                if has_potential_scam:
                    matches_found += 1
                    self.stdout.write(self.style.WARNING(
                        f'  ⚠ Potential scam detected! Confidence: {confidence_score}%'
                    ))

                    # Create auto-generated report
                    self.create_auto_report(crawled_content, matched_keywords, confidence_score)

                # Extract links for further crawling
                if depth < seed.crawl_depth:
                    for link in soup.find_all('a', href=True):
                        absolute_url = urljoin(url, link['href'])
                        parsed_link = urlparse(absolute_url)

                        # Only follow links within the allowed domains
                        if parsed_link.netloc in allowed_domains and absolute_url not in visited_urls:
                            urls_to_visit.append((absolute_url, depth + 1))

                # Rate limiting
                time.sleep(delay)

            except requests.RequestException as e:
                self.stdout.write(self.style.ERROR(f'  Error fetching {url}: {e}'))
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f'  Error processing {url}: {e}'))
                continue

        return pages_crawled, matches_found

    def match_keyword(self, keyword_obj, text, url, title):
        """Match a keyword against text content."""
        keyword = keyword_obj.keyword
        flags = 0 if keyword_obj.case_sensitive else re.IGNORECASE

        matches = []

        if keyword_obj.keyword_type == 'exact':
            if keyword_obj.case_sensitive:
                if keyword in text or keyword in url or keyword in title:
                    matches.append(keyword)
            else:
                if (keyword.lower() in text.lower()
                        or keyword.lower() in url.lower()
                        or keyword.lower() in title.lower()):
                    matches.append(keyword)

        elif keyword_obj.keyword_type == 'regex':
            try:
                pattern = re.compile(keyword, flags)
                matches = pattern.findall(text + ' ' + url + ' ' + title)
            except re.error:
                self.stdout.write(self.style.ERROR(f'  Invalid regex: {keyword}'))

        elif keyword_obj.keyword_type == 'phrase':
            # Phrase matching (word boundaries)
            pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)
            matches = pattern.findall(text + ' ' + url + ' ' + title)

        elif keyword_obj.keyword_type == 'domain':
            # Domain pattern matching
            pattern = re.compile(keyword, flags)
            matches = pattern.findall(url)

        elif keyword_obj.keyword_type == 'email':
            # Email pattern ([A-Za-z], not [A-Z|a-z]: a pipe inside a
            # character class is a literal character, not alternation)
            email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', flags)
            found_emails = email_pattern.findall(text + ' ' + url)
            # Check if any email matches the keyword pattern
            pattern = re.compile(keyword, flags)
            matches = [email for email in found_emails if pattern.search(email)]

        elif keyword_obj.keyword_type == 'phone':
            # Phone pattern
            phone_pattern = re.compile(r'\+?[(]?[0-9]{1,4}[)]?[-\s.]?[(]?[0-9]{1,4}[)]?[-\s.]?[0-9]{1,9}', flags)
            found_phones = phone_pattern.findall(text)
            # Check if any phone matches the keyword pattern
            pattern = re.compile(keyword, flags)
            matches = [phone for phone in found_phones if pattern.search(phone)]

        return matches

    def calculate_confidence(self, matched_keywords, match_count):
        """Calculate confidence score based on matched keywords."""
        if not matched_keywords:
            return 0

        # Base score from keyword confidence scores
        base_score = sum(kw.confidence_score for kw in matched_keywords) / len(matched_keywords)

        # Boost for multiple matches
        match_boost = min(match_count * 2, 30)  # Max 30 point boost

        # Boost for multiple different keywords
        keyword_boost = min(len(matched_keywords) * 5, 20)  # Max 20 point boost

        total_score = base_score + match_boost + keyword_boost
        return min(int(total_score), 100)  # Cap at 100
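    # Worked example of the scoring above (values hypothetical): two keywords
    # with confidence scores 50 and 70 that match 5 times in total give
    #   base_score    = (50 + 70) / 2  = 60
    #   match_boost   = min(5 * 2, 30) = 10
    #   keyword_boost = min(2 * 5, 20) = 10
    #   total         = 80  (already <= 100, so the cap does not apply)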

    def create_auto_report(self, crawled_content, matched_keywords, confidence_score):
        """Create an auto-generated report from crawled content."""
        # Check if a report already exists
        if AutoGeneratedReport.objects.filter(crawled_content=crawled_content).exists():
            return

        # Generate title
        title = f"Potential Scam Detected: {crawled_content.title or crawled_content.url}"
        if len(title) > 500:
            title = title[:497] + '...'

        # Generate description
        description = "Automatically detected potential scam from OSINT crawling.\n\n"
        description += f"Source URL: {crawled_content.url}\n"
        description += f"Matched Keywords: {', '.join(kw.name for kw in matched_keywords)}\n"
        description += f"Confidence Score: {confidence_score}%\n\n"

        # Extract relevant snippet
        content_preview = crawled_content.content[:500] + '...' if len(crawled_content.content) > 500 else crawled_content.content
        description += f"Content Preview:\n{content_preview}"

        # Determine whether to auto-approve
        status = 'pending'
        if confidence_score >= 80 and any(kw.auto_approve for kw in matched_keywords):
            status = 'approved'

        # Create auto-generated report
        auto_report = AutoGeneratedReport.objects.create(
            crawled_content=crawled_content,
            title=title,
            description=description,
            source_url=crawled_content.url,
            confidence_score=confidence_score,
            status=status
        )
        auto_report.matched_keywords.set(matched_keywords)

        # If auto-approved, create the actual report
        if status == 'approved':
            self.create_scam_report(auto_report)

    def create_scam_report(self, auto_report):
        """Create actual scam report from auto-generated report."""
        report = ScamReport.objects.create(
            title=auto_report.title,
            description=auto_report.description,
            reported_url=auto_report.source_url,
            scam_type='other',  # Default type, can be updated by moderator
            status='verified',  # Auto-verified since reviewed
            verification_score=auto_report.confidence_score,
            is_public=True,
            is_anonymous=True,  # System-generated
            is_auto_discovered=True,  # Mark as auto-discovered
        )

        auto_report.report = report
        auto_report.status = 'published'
        auto_report.published_at = timezone.now()
        auto_report.save()

        self.stdout.write(self.style.SUCCESS(
            f'  ✓ Auto-approved and published report: {report.title}'
        ))
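The command is designed to run from cron or Celery; for ad-hoc runs, an equivalent sketch via Django's standard call_command API (the seed id and option values are hypothetical):

from django.core.management import call_command

# Crawl every active seed regardless of its interval
call_command('crawl_osint', all=True, max_pages=20, delay=2.0)

# Re-crawl a single seed immediately
call_command('crawl_osint', seed_id=3, force=True)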
80  osint/migrations/0001_initial.py  Normal file
@@ -0,0 +1,80 @@
# Generated by Django 5.2.8 on 2025-11-26 13:41

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        ('reports', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='OSINTConfiguration',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('service_name', models.CharField(max_length=100, unique=True)),
                ('api_key', models.CharField(blank=True, help_text='Encrypted API key', max_length=255)),
                ('api_url', models.URLField(blank=True)),
                ('is_active', models.BooleanField(default=True)),
                ('rate_limit', models.IntegerField(default=100, help_text='Requests per hour')),
                ('configuration', models.JSONField(blank=True, default=dict, help_text='Additional configuration')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'verbose_name': 'OSINT Configuration',
                'verbose_name_plural': 'OSINT Configurations',
                'db_table': 'osint_osintconfiguration',
            },
        ),
        migrations.CreateModel(
            name='OSINTResult',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('source', models.CharField(help_text='OSINT source/service name', max_length=100)),
                ('data_type', models.CharField(choices=[('whois', 'WHOIS Data'), ('dns', 'DNS Records'), ('ssl', 'SSL Certificate'), ('archive', 'Archive Data'), ('email', 'Email Data'), ('phone', 'Phone Data'), ('business', 'Business Registry Data'), ('social', 'Social Media Data'), ('reputation', 'Reputation Data')], max_length=50)),
                ('raw_data', models.JSONField(default=dict, help_text='Raw data from OSINT source')),
                ('processed_data', models.JSONField(blank=True, default=dict, help_text='Processed/cleaned data')),
                ('confidence_level', models.IntegerField(default=0, help_text='Confidence level (0-100)')),
                ('is_verified', models.BooleanField(default=False, help_text='Manually verified by moderator')),
                ('collected_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='osint_results', to='reports.scamreport')),
            ],
            options={
                'verbose_name': 'OSINT Result',
                'verbose_name_plural': 'OSINT Results',
                'db_table': 'osint_osintresult',
                'ordering': ['-collected_at'],
                'indexes': [models.Index(fields=['report', 'data_type'], name='osint_osint_report__4a95b0_idx'), models.Index(fields=['confidence_level', 'is_verified'], name='osint_osint_confide_47552d_idx')],
            },
        ),
        migrations.CreateModel(
            name='OSINTTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('task_type', models.CharField(choices=[('domain_analysis', 'Domain Analysis'), ('url_analysis', 'URL Analysis'), ('email_analysis', 'Email Analysis'), ('phone_analysis', 'Phone Analysis'), ('whois_lookup', 'WHOIS Lookup'), ('dns_lookup', 'DNS Lookup'), ('ssl_check', 'SSL Certificate Check'), ('archive_check', 'Archive Check'), ('business_registry', 'Business Registry Check'), ('social_media', 'Social Media Check')], max_length=50)),
                ('status', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('completed', 'Completed'), ('failed', 'Failed'), ('cancelled', 'Cancelled')], default='pending', max_length=20)),
                ('parameters', models.JSONField(default=dict, help_text='Task parameters (e.g., URL, email, phone)')),
                ('result', models.JSONField(blank=True, default=dict, help_text='Task result data')),
                ('error_message', models.TextField(blank=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('started_at', models.DateTimeField(blank=True, null=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
                ('retry_count', models.IntegerField(default=0)),
                ('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='osint_tasks', to='reports.scamreport')),
            ],
            options={
                'verbose_name': 'OSINT Task',
                'verbose_name_plural': 'OSINT Tasks',
                'db_table': 'osint_osinttask',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['status', 'created_at'], name='osint_osint_status_290802_idx'), models.Index(fields=['report', 'task_type'], name='osint_osint_report__e7bd16_idx')],
            },
        ),
    ]
@@ -0,0 +1,157 @@
# Generated by Django 5.2.8 on 2025-11-26 18:03

import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('osint', '0001_initial'),
        ('reports', '0002_scamreport_is_auto_discovered'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='OSINTKeyword',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('keyword', models.CharField(help_text='Keyword, phrase, or regex pattern to search for', max_length=500)),
                ('name', models.CharField(help_text='Friendly name for this keyword', max_length=200)),
                ('description', models.TextField(blank=True, help_text='Description of what this keyword detects')),
                ('keyword_type', models.CharField(choices=[('exact', 'Exact Match'), ('regex', 'Regular Expression'), ('phrase', 'Phrase Match'), ('domain', 'Domain Pattern'), ('email', 'Email Pattern'), ('phone', 'Phone Pattern')], default='phrase', help_text='Type of matching to perform', max_length=20)),
                ('is_active', models.BooleanField(default=True, help_text='Enable/disable this keyword')),
                ('case_sensitive', models.BooleanField(default=False, help_text='Case sensitive matching')),
                ('confidence_score', models.IntegerField(default=50, help_text='Default confidence score (0-100) when this keyword matches')),
                ('auto_approve', models.BooleanField(default=False, help_text='Auto-approve reports matching this keyword (requires high confidence)')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_keywords', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'verbose_name': 'OSINT Keyword',
                'verbose_name_plural': 'OSINT Keywords',
                'db_table': 'osint_keyword',
                'ordering': ['-is_active', 'name'],
            },
        ),
        migrations.CreateModel(
            name='CrawledContent',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('url', models.URLField(help_text='URL of the crawled page', max_length=1000)),
                ('title', models.CharField(blank=True, help_text='Page title', max_length=500)),
                ('content', models.TextField(help_text='Crawled page content')),
                ('html_content', models.TextField(blank=True, help_text='Raw HTML content')),
                ('match_count', models.IntegerField(default=0, help_text='Number of keyword matches found')),
                ('confidence_score', models.IntegerField(default=0, help_text='Calculated confidence score based on matches')),
                ('has_potential_scam', models.BooleanField(default=False, help_text='Flagged as potential scam based on keyword matches')),
                ('crawled_at', models.DateTimeField(auto_now_add=True)),
                ('http_status', models.IntegerField(blank=True, help_text='HTTP status code', null=True)),
                ('content_hash', models.CharField(blank=True, help_text='SHA256 hash of content for deduplication', max_length=64)),
                ('matched_keywords', models.ManyToManyField(blank=True, help_text='Keywords that matched this content', related_name='matched_contents', to='osint.osintkeyword')),
            ],
            options={
                'verbose_name': 'Crawled Content',
                'verbose_name_plural': 'Crawled Contents',
                'db_table': 'osint_crawledcontent',
                'ordering': ['-crawled_at', '-confidence_score'],
            },
        ),
        migrations.CreateModel(
            name='AutoGeneratedReport',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(help_text='Auto-generated report title', max_length=500)),
                ('description', models.TextField(help_text='Auto-generated report description')),
                ('source_url', models.URLField(help_text='Source URL where scam was found', max_length=1000)),
                ('confidence_score', models.IntegerField(default=0, help_text='Confidence score (0-100)')),
                ('status', models.CharField(choices=[('pending', 'Pending Review'), ('approved', 'Approved'), ('rejected', 'Rejected'), ('published', 'Published')], default='pending', help_text='Review status', max_length=20)),
                ('review_notes', models.TextField(blank=True, help_text='Notes from moderator/admin review')),
                ('reviewed_at', models.DateTimeField(blank=True, null=True)),
                ('published_at', models.DateTimeField(blank=True, null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('report', models.ForeignKey(blank=True, help_text='Linked scam report (created when approved)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='auto_generated_reports', to='reports.scamreport')),
                ('reviewed_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='reviewed_auto_reports', to=settings.AUTH_USER_MODEL)),
                ('crawled_content', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='auto_report', to='osint.crawledcontent')),
                ('matched_keywords', models.ManyToManyField(related_name='generated_reports', to='osint.osintkeyword')),
            ],
            options={
                'verbose_name': 'Auto-Generated Report',
                'verbose_name_plural': 'Auto-Generated Reports',
                'db_table': 'osint_autogeneratedreport',
                'ordering': ['-created_at', '-confidence_score'],
            },
        ),
        migrations.CreateModel(
            name='SeedWebsite',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('url', models.URLField(help_text='Base URL to crawl', max_length=500)),
                ('name', models.CharField(help_text='Friendly name for this seed website', max_length=200)),
                ('description', models.TextField(blank=True, help_text='Description of the website')),
                ('is_active', models.BooleanField(default=True, help_text='Enable/disable crawling for this website')),
                ('priority', models.CharField(choices=[('high', 'High'), ('medium', 'Medium'), ('low', 'Low')], default='medium', help_text='Crawling priority', max_length=10)),
                ('crawl_depth', models.IntegerField(default=2, help_text='Maximum depth to crawl (0 = only this page, 1 = this page + direct links, etc.)')),
                ('crawl_interval_hours', models.IntegerField(default=24, help_text='Hours between crawls')),
                ('allowed_domains', models.JSONField(blank=True, default=list, help_text='List of allowed domains to crawl (empty = same domain only)')),
                ('user_agent', models.CharField(blank=True, default='Mozilla/5.0 (compatible; OSINTBot/1.0)', help_text='User agent string for requests', max_length=255)),
                ('last_crawled_at', models.DateTimeField(blank=True, null=True)),
                ('pages_crawled', models.IntegerField(default=0)),
                ('matches_found', models.IntegerField(default=0)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_seed_websites', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'verbose_name': 'Seed Website',
                'verbose_name_plural': 'Seed Websites',
                'db_table': 'osint_seedwebsite',
                'ordering': ['-priority', '-last_crawled_at'],
            },
        ),
        migrations.AddField(
            model_name='crawledcontent',
            name='seed_website',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='crawled_contents', to='osint.seedwebsite'),
        ),
        migrations.AddIndex(
            model_name='osintkeyword',
            index=models.Index(fields=['is_active', 'keyword_type'], name='osint_keywo_is_acti_6f4814_idx'),
        ),
        migrations.AddIndex(
            model_name='autogeneratedreport',
            index=models.Index(fields=['status', 'confidence_score'], name='osint_autog_status_a8a215_idx'),
        ),
        migrations.AddIndex(
            model_name='autogeneratedreport',
            index=models.Index(fields=['created_at'], name='osint_autog_created_07e2b0_idx'),
        ),
        migrations.AddIndex(
            model_name='seedwebsite',
            index=models.Index(fields=['is_active', 'priority'], name='osint_seedw_is_acti_411fa2_idx'),
        ),
        migrations.AddIndex(
            model_name='seedwebsite',
            index=models.Index(fields=['last_crawled_at'], name='osint_seedw_last_cr_673111_idx'),
        ),
        migrations.AddIndex(
            model_name='crawledcontent',
            index=models.Index(fields=['seed_website', 'crawled_at'], name='osint_crawl_seed_we_eb78f4_idx'),
        ),
        migrations.AddIndex(
            model_name='crawledcontent',
            index=models.Index(fields=['has_potential_scam', 'confidence_score'], name='osint_crawl_has_pot_9317d0_idx'),
        ),
        migrations.AddIndex(
            model_name='crawledcontent',
            index=models.Index(fields=['content_hash'], name='osint_crawl_content_17d05a_idx'),
        ),
        migrations.AlterUniqueTogether(
            name='crawledcontent',
            unique_together={('url', 'content_hash')},
        ),
    ]
0  osint/migrations/__init__.py  Normal file
468  osint/models.py  Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
OSINT (Open Source Intelligence) integration models.
|
||||
"""
|
||||
from django.db import models
|
||||
from django.contrib.auth import get_user_model
|
||||
from reports.models import ScamReport
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
|
||||
class OSINTTask(models.Model):
|
||||
"""
|
||||
Background tasks for OSINT data collection.
|
||||
"""
|
||||
TASK_TYPE_CHOICES = [
|
||||
('domain_analysis', 'Domain Analysis'),
|
||||
('url_analysis', 'URL Analysis'),
|
||||
('email_analysis', 'Email Analysis'),
|
||||
('phone_analysis', 'Phone Analysis'),
|
||||
('whois_lookup', 'WHOIS Lookup'),
|
||||
('dns_lookup', 'DNS Lookup'),
|
||||
('ssl_check', 'SSL Certificate Check'),
|
||||
('archive_check', 'Archive Check'),
|
||||
('business_registry', 'Business Registry Check'),
|
||||
('social_media', 'Social Media Check'),
|
||||
]
|
||||
|
||||
STATUS_CHOICES = [
|
||||
('pending', 'Pending'),
|
||||
('running', 'Running'),
|
||||
('completed', 'Completed'),
|
||||
('failed', 'Failed'),
|
||||
('cancelled', 'Cancelled'),
|
||||
]
|
||||
|
||||
report = models.ForeignKey(
|
||||
ScamReport,
|
||||
on_delete=models.CASCADE,
|
||||
related_name='osint_tasks'
|
||||
)
|
||||
task_type = models.CharField(
|
||||
max_length=50,
|
||||
choices=TASK_TYPE_CHOICES
|
||||
)
|
||||
status = models.CharField(
|
||||
max_length=20,
|
||||
choices=STATUS_CHOICES,
|
||||
default='pending'
|
||||
)
|
||||
parameters = models.JSONField(
|
||||
default=dict,
|
||||
help_text='Task parameters (e.g., URL, email, phone)'
|
||||
)
|
||||
result = models.JSONField(
|
||||
default=dict,
|
||||
blank=True,
|
||||
help_text='Task result data'
|
||||
)
|
||||
error_message = models.TextField(blank=True)
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
started_at = models.DateTimeField(null=True, blank=True)
|
||||
completed_at = models.DateTimeField(null=True, blank=True)
|
||||
retry_count = models.IntegerField(default=0)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_osinttask'
|
||||
verbose_name = 'OSINT Task'
|
||||
verbose_name_plural = 'OSINT Tasks'
|
||||
ordering = ['-created_at']
|
||||
indexes = [
|
||||
models.Index(fields=['status', 'created_at']),
|
||||
models.Index(fields=['report', 'task_type']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.get_task_type_display()} for Report #{self.report.id} - {self.get_status_display()}"
|
||||
|
||||
|
||||
class OSINTResult(models.Model):
|
||||
"""
|
||||
OSINT investigation results.
|
||||
"""
|
||||
DATA_TYPE_CHOICES = [
|
||||
('whois', 'WHOIS Data'),
|
||||
('dns', 'DNS Records'),
|
||||
('ssl', 'SSL Certificate'),
|
||||
('archive', 'Archive Data'),
|
||||
('email', 'Email Data'),
|
||||
('phone', 'Phone Data'),
|
||||
('business', 'Business Registry Data'),
|
||||
('social', 'Social Media Data'),
|
||||
('reputation', 'Reputation Data'),
|
||||
]
|
||||
|
||||
report = models.ForeignKey(
|
||||
ScamReport,
|
||||
on_delete=models.CASCADE,
|
||||
related_name='osint_results'
|
||||
)
|
||||
source = models.CharField(
|
||||
max_length=100,
|
||||
help_text='OSINT source/service name'
|
||||
)
|
||||
data_type = models.CharField(
|
||||
max_length=50,
|
||||
choices=DATA_TYPE_CHOICES
|
||||
)
|
||||
raw_data = models.JSONField(
|
||||
default=dict,
|
||||
help_text='Raw data from OSINT source'
|
||||
)
|
||||
processed_data = models.JSONField(
|
||||
default=dict,
|
||||
blank=True,
|
||||
help_text='Processed/cleaned data'
|
||||
)
|
||||
confidence_level = models.IntegerField(
|
||||
default=0,
|
||||
help_text='Confidence level (0-100)'
|
||||
)
|
||||
is_verified = models.BooleanField(
|
||||
default=False,
|
||||
help_text='Manually verified by moderator'
|
||||
)
|
||||
collected_at = models.DateTimeField(auto_now_add=True)
|
||||
updated_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_osintresult'
|
||||
verbose_name = 'OSINT Result'
|
||||
verbose_name_plural = 'OSINT Results'
|
||||
ordering = ['-collected_at']
|
||||
indexes = [
|
||||
models.Index(fields=['report', 'data_type']),
|
||||
models.Index(fields=['confidence_level', 'is_verified']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.get_data_type_display()} from {self.source} for Report #{self.report.id}"
|
||||
|
||||
|
||||
class OSINTConfiguration(models.Model):
|
||||
"""
|
||||
Configuration for OSINT services and APIs.
|
||||
"""
|
||||
service_name = models.CharField(max_length=100, unique=True)
|
||||
api_key = models.CharField(
|
||||
max_length=255,
|
||||
blank=True,
|
||||
help_text='Encrypted API key'
|
||||
)
|
||||
api_url = models.URLField(blank=True)
|
||||
is_active = models.BooleanField(default=True)
|
||||
rate_limit = models.IntegerField(
|
||||
default=100,
|
||||
help_text='Requests per hour'
|
||||
)
|
||||
configuration = models.JSONField(
|
||||
default=dict,
|
||||
blank=True,
|
||||
help_text='Additional configuration'
|
||||
)
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
updated_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_osintconfiguration'
|
||||
verbose_name = 'OSINT Configuration'
|
||||
verbose_name_plural = 'OSINT Configurations'
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.service_name} ({'Active' if self.is_active else 'Inactive'})"
|
||||
|
||||
|
||||
class SeedWebsite(models.Model):
|
||||
"""
|
||||
Seed websites for OSINT crawling.
|
||||
"""
|
||||
PRIORITY_CHOICES = [
|
||||
('high', 'High'),
|
||||
('medium', 'Medium'),
|
||||
('low', 'Low'),
|
||||
]
|
||||
|
||||
url = models.URLField(
|
||||
max_length=500,
|
||||
help_text='Base URL to crawl'
|
||||
)
|
||||
name = models.CharField(
|
||||
max_length=200,
|
||||
help_text='Friendly name for this seed website'
|
||||
)
|
||||
description = models.TextField(
|
||||
blank=True,
|
||||
help_text='Description of the website'
|
||||
)
|
||||
is_active = models.BooleanField(
|
||||
default=True,
|
||||
help_text='Enable/disable crawling for this website'
|
||||
)
|
||||
priority = models.CharField(
|
||||
max_length=10,
|
||||
choices=PRIORITY_CHOICES,
|
||||
default='medium',
|
||||
help_text='Crawling priority'
|
||||
)
|
||||
crawl_depth = models.IntegerField(
|
||||
default=2,
|
||||
help_text='Maximum depth to crawl (0 = only this page, 1 = this page + direct links, etc.)'
|
||||
)
|
||||
crawl_interval_hours = models.IntegerField(
|
||||
default=24,
|
||||
help_text='Hours between crawls'
|
||||
)
|
||||
allowed_domains = models.JSONField(
|
||||
default=list,
|
||||
blank=True,
|
||||
help_text='List of allowed domains to crawl (empty = same domain only)'
|
||||
)
|
||||
user_agent = models.CharField(
|
||||
max_length=255,
|
||||
blank=True,
|
||||
default='Mozilla/5.0 (compatible; OSINTBot/1.0)',
|
||||
help_text='User agent string for requests'
|
||||
)
|
||||
last_crawled_at = models.DateTimeField(null=True, blank=True)
|
||||
pages_crawled = models.IntegerField(default=0)
|
||||
matches_found = models.IntegerField(default=0)
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
updated_at = models.DateTimeField(auto_now=True)
|
||||
created_by = models.ForeignKey(
|
||||
User,
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
related_name='created_seed_websites'
|
||||
)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_seedwebsite'
|
||||
verbose_name = 'Seed Website'
|
||||
verbose_name_plural = 'Seed Websites'
|
||||
ordering = ['-priority', '-last_crawled_at']
|
||||
indexes = [
|
||||
models.Index(fields=['is_active', 'priority']),
|
||||
models.Index(fields=['last_crawled_at']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name} ({self.url})"
|
||||
|
||||
|
||||
class OSINTKeyword(models.Model):
|
||||
"""
|
||||
Keywords and patterns to search for during OSINT crawling.
|
||||
"""
|
||||
TYPE_CHOICES = [
|
||||
('exact', 'Exact Match'),
|
||||
('regex', 'Regular Expression'),
|
||||
('phrase', 'Phrase Match'),
|
||||
('domain', 'Domain Pattern'),
|
||||
('email', 'Email Pattern'),
|
||||
('phone', 'Phone Pattern'),
|
||||
]
|
||||
|
||||
keyword = models.CharField(
|
||||
max_length=500,
|
||||
help_text='Keyword, phrase, or regex pattern to search for'
|
||||
)
|
||||
name = models.CharField(
|
||||
max_length=200,
|
||||
help_text='Friendly name for this keyword'
|
||||
)
|
||||
description = models.TextField(
|
||||
blank=True,
|
||||
help_text='Description of what this keyword detects'
|
||||
)
|
||||
keyword_type = models.CharField(
|
||||
max_length=20,
|
||||
choices=TYPE_CHOICES,
|
||||
default='phrase',
|
||||
help_text='Type of matching to perform'
|
||||
)
|
||||
is_active = models.BooleanField(
|
||||
default=True,
|
||||
help_text='Enable/disable this keyword'
|
||||
)
|
||||
case_sensitive = models.BooleanField(
|
||||
default=False,
|
||||
help_text='Case sensitive matching'
|
||||
)
|
||||
confidence_score = models.IntegerField(
|
||||
default=50,
|
||||
help_text='Default confidence score (0-100) when this keyword matches'
|
||||
)
|
||||
auto_approve = models.BooleanField(
|
||||
default=False,
|
||||
help_text='Auto-approve reports matching this keyword (requires high confidence)'
|
||||
)
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
updated_at = models.DateTimeField(auto_now=True)
|
||||
created_by = models.ForeignKey(
|
||||
User,
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
related_name='created_keywords'
|
||||
)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_keyword'
|
||||
verbose_name = 'OSINT Keyword'
|
||||
verbose_name_plural = 'OSINT Keywords'
|
||||
ordering = ['-is_active', 'name']
|
||||
indexes = [
|
||||
models.Index(fields=['is_active', 'keyword_type']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name} ({self.keyword_type})"
|
||||
|
||||
|
||||
class CrawledContent(models.Model):
|
||||
"""
|
||||
Content crawled from seed websites.
|
||||
"""
|
||||
seed_website = models.ForeignKey(
|
||||
SeedWebsite,
|
||||
on_delete=models.CASCADE,
|
||||
related_name='crawled_contents'
|
||||
)
|
||||
url = models.URLField(
|
||||
max_length=1000,
|
||||
help_text='URL of the crawled page'
|
||||
)
|
||||
title = models.CharField(
|
||||
max_length=500,
|
||||
blank=True,
|
||||
help_text='Page title'
|
||||
)
|
||||
content = models.TextField(
|
||||
help_text='Crawled page content'
|
||||
)
|
||||
html_content = models.TextField(
|
||||
blank=True,
|
||||
help_text='Raw HTML content'
|
||||
)
|
||||
matched_keywords = models.ManyToManyField(
|
||||
OSINTKeyword,
|
||||
blank=True,
|
||||
related_name='matched_contents',
|
||||
help_text='Keywords that matched this content'
|
||||
)
|
||||
match_count = models.IntegerField(
|
||||
default=0,
|
||||
help_text='Number of keyword matches found'
|
||||
)
|
||||
confidence_score = models.IntegerField(
|
||||
default=0,
|
||||
help_text='Calculated confidence score based on matches'
|
||||
)
|
||||
has_potential_scam = models.BooleanField(
|
||||
default=False,
|
||||
help_text='Flagged as potential scam based on keyword matches'
|
||||
)
|
||||
crawled_at = models.DateTimeField(auto_now_add=True)
|
||||
http_status = models.IntegerField(
|
||||
null=True,
|
||||
blank=True,
|
||||
help_text='HTTP status code'
|
||||
)
|
||||
content_hash = models.CharField(
|
||||
max_length=64,
|
||||
blank=True,
|
||||
help_text='SHA256 hash of content for deduplication'
|
||||
)
|
||||
|
||||
class Meta:
|
||||
db_table = 'osint_crawledcontent'
|
||||
verbose_name = 'Crawled Content'
|
||||
verbose_name_plural = 'Crawled Contents'
|
||||
ordering = ['-crawled_at', '-confidence_score']
|
||||
indexes = [
|
||||
models.Index(fields=['seed_website', 'crawled_at']),
|
||||
models.Index(fields=['has_potential_scam', 'confidence_score']),
|
||||
models.Index(fields=['content_hash']),
|
||||
]
|
||||
unique_together = [['url', 'content_hash']]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.title or self.url} - {self.match_count} matches"
|
||||
|
||||
|
||||
class AutoGeneratedReport(models.Model):
    """
    Automatically generated scam reports from OSINT crawling.
    """
    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('approved', 'Approved'),
        ('rejected', 'Rejected'),
        ('published', 'Published'),
    ]

    crawled_content = models.OneToOneField(
        CrawledContent,
        on_delete=models.CASCADE,
        related_name='auto_report'
    )
    report = models.ForeignKey(
        ScamReport,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='auto_generated_reports',
        help_text='Linked scam report (created when approved)'
    )
    title = models.CharField(
        max_length=500,
        help_text='Auto-generated report title'
    )
    description = models.TextField(
        help_text='Auto-generated report description'
    )
    source_url = models.URLField(
        max_length=1000,
        help_text='Source URL where scam was found'
    )
    matched_keywords = models.ManyToManyField(
        OSINTKeyword,
        related_name='generated_reports'
    )
    confidence_score = models.IntegerField(
        default=0,
        help_text='Confidence score (0-100)'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending',
        help_text='Review status'
    )
    review_notes = models.TextField(
        blank=True,
        help_text='Notes from moderator/admin review'
    )
    reviewed_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='reviewed_auto_reports'
    )
    reviewed_at = models.DateTimeField(null=True, blank=True)
    published_at = models.DateTimeField(null=True, blank=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        db_table = 'osint_autogeneratedreport'
        verbose_name = 'Auto-Generated Report'
        verbose_name_plural = 'Auto-Generated Reports'
        ordering = ['-created_at', '-confidence_score']
        indexes = [
            models.Index(fields=['status', 'confidence_score']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        return f"{self.title} - {self.get_status_display()}"

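For illustration, the moderation queue that the views further below build boils down to a query like this (same ordering the list view uses):

pending = (AutoGeneratedReport.objects
           .filter(status='pending')
           .select_related('crawled_content')
           .order_by('-confidence_score', '-created_at'))
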
75
osint/tasks.py
Normal file
@@ -0,0 +1,75 @@
"""
|
||||
Celery tasks for OSINT crawling.
|
||||
"""
|
||||
from celery import shared_task
|
||||
from django.core.management import call_command
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from .models import SeedWebsite, AutoGeneratedReport
|
||||
|
||||
|
||||
@shared_task
|
||||
def crawl_osint_seeds():
|
||||
"""
|
||||
Periodic task to crawl all due seed websites.
|
||||
This should be scheduled to run periodically (e.g., every hour).
|
||||
"""
|
||||
try:
|
||||
call_command('crawl_osint', '--all', verbosity=0)
|
||||
return "OSINT crawling completed successfully"
|
||||
except Exception as e:
|
||||
return f"OSINT crawling failed: {str(e)}"
|
||||
|
||||
|
||||
@shared_task
|
||||
def crawl_specific_seed(seed_id):
|
||||
"""
|
||||
Crawl a specific seed website.
|
||||
"""
|
||||
try:
|
||||
call_command('crawl_osint', '--seed-id', str(seed_id), verbosity=0)
|
||||
return f"Seed website {seed_id} crawled successfully"
|
||||
except Exception as e:
|
||||
return f"Seed website {seed_id} crawling failed: {str(e)}"
|
||||
|
||||
|
||||
@shared_task
|
||||
def auto_approve_high_confidence_reports():
|
||||
"""
|
||||
Auto-approve reports with very high confidence scores and auto-approve keywords.
|
||||
"""
|
||||
from reports.models import ScamReport
|
||||
|
||||
# Get auto-reports that should be auto-approved
|
||||
auto_reports = AutoGeneratedReport.objects.filter(
|
||||
status='pending',
|
||||
confidence_score__gte=80
|
||||
).prefetch_related('matched_keywords')
|
||||
|
||||
approved_count = 0
|
||||
for auto_report in auto_reports:
|
||||
# Check if any matched keyword has auto_approve enabled
|
||||
if any(kw.auto_approve for kw in auto_report.matched_keywords.all()):
|
||||
# Approve and publish
|
||||
from osint.views import ApproveAutoReportView
|
||||
# Create report directly
|
||||
report = ScamReport.objects.create(
|
||||
title=auto_report.title,
|
||||
description=auto_report.description,
|
||||
reported_url=auto_report.source_url,
|
||||
scam_type='other',
|
||||
status='verified',
|
||||
verification_score=auto_report.confidence_score,
|
||||
is_public=True,
|
||||
is_anonymous=True,
|
||||
is_auto_discovered=True, # Mark as auto-discovered
|
||||
)
|
||||
|
||||
auto_report.report = report
|
||||
auto_report.status = 'published'
|
||||
auto_report.published_at = timezone.now()
|
||||
auto_report.save()
|
||||
approved_count += 1
|
||||
|
||||
return f"Auto-approved {approved_count} reports"
|
||||
|
||||
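For scheduling, a Celery beat entry along these lines would run the crawl hourly and the approval sweep daily (a sketch for the project settings; the entry names are assumptions, and it presumes the project loads Celery config from Django settings with the CELERY_ namespace):

from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'crawl-osint-seeds-hourly': {
        'task': 'osint.tasks.crawl_osint_seeds',
        'schedule': crontab(minute=0),  # top of every hour
    },
    'auto-approve-high-confidence-daily': {
        'task': 'osint.tasks.auto_approve_high_confidence_reports',
        'schedule': crontab(hour=3, minute=0),  # once a day, off-peak
    },
}
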
3
osint/tests.py
Normal file
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
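As a starting point, a database-free smoke test of the model's __str__ could look like this (a sketch; the class and method names are assumptions):

from django.test import SimpleTestCase
from .models import AutoGeneratedReport

class AutoGeneratedReportTests(SimpleTestCase):
    def test_str_includes_status_display(self):
        # An unsaved instance is enough: __str__ touches only title and status
        report = AutoGeneratedReport(title='Fake prize draw', status='pending')
        self.assertEqual(str(report), 'Fake prize draw - Pending Review')
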
35
osint/urls.py
Normal file
@@ -0,0 +1,35 @@
"""
|
||||
URL configuration for osint app.
|
||||
"""
|
||||
from django.urls import path
|
||||
from . import views
|
||||
|
||||
app_name = 'osint'
|
||||
|
||||
urlpatterns = [
|
||||
# Admin Dashboard (Main OSINT Management)
|
||||
path('admin-dashboard/', views.OSINTAdminDashboardView.as_view(), name='admin_dashboard'),
|
||||
|
||||
# Seed Website Management
|
||||
path('admin-dashboard/seeds/add/', views.SeedWebsiteCreateView.as_view(), name='seed_create'),
|
||||
path('admin-dashboard/seeds/<int:pk>/edit/', views.SeedWebsiteUpdateView.as_view(), name='seed_edit'),
|
||||
path('admin-dashboard/seeds/<int:pk>/delete/', views.SeedWebsiteDeleteView.as_view(), name='seed_delete'),
|
||||
|
||||
# Keyword Management
|
||||
path('admin-dashboard/keywords/add/', views.OSINTKeywordCreateView.as_view(), name='keyword_create'),
|
||||
path('admin-dashboard/keywords/<int:pk>/edit/', views.OSINTKeywordUpdateView.as_view(), name='keyword_edit'),
|
||||
path('admin-dashboard/keywords/<int:pk>/delete/', views.OSINTKeywordDeleteView.as_view(), name='keyword_delete'),
|
||||
|
||||
# Crawling Control
|
||||
path('admin-dashboard/start-crawling/', views.StartCrawlingView.as_view(), name='start_crawling'),
|
||||
|
||||
# Legacy/Moderator Views
|
||||
path('tasks/', views.OSINTTaskListView.as_view(), name='task_list'),
|
||||
path('tasks/<int:pk>/', views.OSINTTaskDetailView.as_view(), name='task_detail'),
|
||||
path('results/<int:report_id>/', views.OSINTResultListView.as_view(), name='result_list'),
|
||||
path('auto-reports/', views.AutoReportListView.as_view(), name='auto_report_list'),
|
||||
path('auto-reports/<int:pk>/', views.AutoReportDetailView.as_view(), name='auto_report_detail'),
|
||||
path('auto-reports/<int:pk>/approve/', views.ApproveAutoReportView.as_view(), name='approve_auto_report'),
|
||||
path('auto-reports/<int:pk>/reject/', views.RejectAutoReportView.as_view(), name='reject_auto_report'),
|
||||
]
|
||||
|
||||
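Because of the app_name namespace, these routes are reversed with the 'osint:' prefix, for example:

from django.urls import reverse

url = reverse('osint:auto_report_detail', args=[42])
# -> '/auto-reports/42/', relative to wherever this URLconf is included
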
346
osint/views.py
Normal file
@@ -0,0 +1,346 @@
"""
|
||||
Views for osint app.
|
||||
"""
|
||||
from django.shortcuts import get_object_or_404, redirect
|
||||
from django.views.generic import ListView, DetailView, UpdateView, TemplateView, CreateView, DeleteView
|
||||
from django.contrib.auth.mixins import LoginRequiredMixin, UserPassesTestMixin
|
||||
from django.contrib.messages.views import SuccessMessageMixin
|
||||
from django.contrib import messages
|
||||
from django.urls import reverse_lazy
|
||||
from django.utils import timezone
|
||||
from django.db import transaction
|
||||
from django.db.models import Count, Q
|
||||
from django.http import JsonResponse
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import CommandError
|
||||
import subprocess
|
||||
import threading
|
||||
from reports.models import ScamReport
|
||||
from .models import OSINTTask, OSINTResult, AutoGeneratedReport, SeedWebsite, OSINTKeyword, CrawledContent
|
||||
from .forms import SeedWebsiteForm, OSINTKeywordForm
|
||||
|
||||
|
||||
class ModeratorRequiredMixin(UserPassesTestMixin):
|
||||
"""Mixin to require moderator role."""
|
||||
def test_func(self):
|
||||
return self.request.user.is_authenticated and self.request.user.is_moderator()
|
||||
|
||||
|
||||
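Both role mixins assume helper methods on the project's custom user model, roughly like the following (a sketch; the real user model lives elsewhere in the project, and the role field is an assumed name):

from django.contrib.auth.models import AbstractUser
from django.db import models

class User(AbstractUser):
    # 'role' is an assumed field; the actual model may store roles differently
    role = models.CharField(max_length=20, default='user')

    def is_moderator(self):
        return self.role in ('moderator', 'administrator')

    def is_administrator(self):
        return self.role == 'administrator'
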

class OSINTTaskListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
    """List OSINT tasks."""
    model = OSINTTask
    template_name = 'osint/task_list.html'
    context_object_name = 'tasks'
    paginate_by = 50

    def get_queryset(self):
        status = self.request.GET.get('status', '')
        queryset = OSINTTask.objects.select_related('report')
        if status:
            queryset = queryset.filter(status=status)
        return queryset.order_by('-created_at')


class OSINTTaskDetailView(LoginRequiredMixin, ModeratorRequiredMixin, DetailView):
    """View OSINT task details."""
    model = OSINTTask
    template_name = 'osint/task_detail.html'
    context_object_name = 'task'


class OSINTResultListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
    """List OSINT results for a report."""
    model = OSINTResult
    template_name = 'osint/result_list.html'
    context_object_name = 'results'

    def get_queryset(self):
        report = get_object_or_404(ScamReport, pk=self.kwargs['report_id'])
        return OSINTResult.objects.filter(report=report).order_by('-collected_at')

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context['report'] = get_object_or_404(ScamReport, pk=self.kwargs['report_id'])
        return context


class AutoReportListView(LoginRequiredMixin, ModeratorRequiredMixin, ListView):
    """List auto-generated reports for review."""
    model = AutoGeneratedReport
    template_name = 'osint/auto_report_list.html'
    context_object_name = 'auto_reports'
    paginate_by = 20

    def get_queryset(self):
        status = self.request.GET.get('status', 'pending')
        queryset = AutoGeneratedReport.objects.select_related(
            'crawled_content', 'reviewed_by', 'report'
        ).prefetch_related('matched_keywords')
        if status:
            queryset = queryset.filter(status=status)
        return queryset.order_by('-confidence_score', '-created_at')

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context['pending_count'] = AutoGeneratedReport.objects.filter(status='pending').count()
        context['approved_count'] = AutoGeneratedReport.objects.filter(status='approved').count()
        context['published_count'] = AutoGeneratedReport.objects.filter(status='published').count()
        context['rejected_count'] = AutoGeneratedReport.objects.filter(status='rejected').count()
        return context
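For reference, the four status counts above can be collapsed into a single query with filtered aggregation (a drop-in variant for get_context_data):

from django.db.models import Count, Q

counts = AutoGeneratedReport.objects.aggregate(
    pending_count=Count('pk', filter=Q(status='pending')),
    approved_count=Count('pk', filter=Q(status='approved')),
    published_count=Count('pk', filter=Q(status='published')),
    rejected_count=Count('pk', filter=Q(status='rejected')),
)
context.update(counts)  # one round trip instead of four
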

class AutoReportDetailView(LoginRequiredMixin, ModeratorRequiredMixin, DetailView):
    """View auto-generated report details."""
    model = AutoGeneratedReport
    template_name = 'osint/auto_report_detail.html'
    context_object_name = 'auto_report'

    def get_queryset(self):
        return AutoGeneratedReport.objects.select_related(
            'crawled_content', 'crawled_content__seed_website',
            'reviewed_by', 'report'
        ).prefetch_related('matched_keywords')


class ApproveAutoReportView(LoginRequiredMixin, ModeratorRequiredMixin, SuccessMessageMixin, UpdateView):
    """Approve an auto-generated report and publish it as a scam report."""
    model = AutoGeneratedReport
    fields = []
    template_name = 'osint/approve_auto_report.html'
    success_message = "Auto-generated report approved successfully!"

    def form_valid(self, form):
        auto_report = form.instance

        with transaction.atomic():
            # Create the actual scam report from the auto-generated draft
            report = ScamReport.objects.create(
                title=auto_report.title,
                description=auto_report.description,
                reported_url=auto_report.source_url,
                scam_type='other',  # Default, can be updated later
                status='verified',
                verification_score=auto_report.confidence_score,
                is_public=True,
                is_anonymous=True,  # System-generated
                is_auto_discovered=True,  # Mark as auto-discovered
            )

            # Record the review and publish in one save; approval
            # immediately publishes, so 'approved' is never persisted
            auto_report.report = report
            auto_report.status = 'published'
            auto_report.reviewed_by = self.request.user
            auto_report.reviewed_at = timezone.now()
            auto_report.published_at = timezone.now()
            auto_report.save()

        return super().form_valid(form)

    def get_success_url(self):
        return reverse_lazy('osint:auto_report_list')


class RejectAutoReportView(LoginRequiredMixin, ModeratorRequiredMixin, SuccessMessageMixin, UpdateView):
    """Reject an auto-generated report."""
    model = AutoGeneratedReport
    fields = []
    template_name = 'osint/reject_auto_report.html'
    success_message = "Auto-generated report rejected."

    def form_valid(self, form):
        auto_report = form.instance
        auto_report.status = 'rejected'
        auto_report.reviewed_by = self.request.user
        auto_report.reviewed_at = timezone.now()
        auto_report.review_notes = self.request.POST.get('review_notes', '').strip()
        auto_report.save()
        return super().form_valid(form)

    def get_success_url(self):
        return reverse_lazy('osint:auto_report_list')


class AdminRequiredMixin(UserPassesTestMixin):
    """Mixin to require the administrator role."""

    def test_func(self):
        return self.request.user.is_authenticated and self.request.user.is_administrator()

class OSINTAdminDashboardView(LoginRequiredMixin, AdminRequiredMixin, TemplateView):
    """Comprehensive OSINT admin dashboard."""
    template_name = 'osint/admin_dashboard.html'

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        now = timezone.now()

        # Seed Website Statistics
        context['total_seeds'] = SeedWebsite.objects.count()
        context['active_seeds'] = SeedWebsite.objects.filter(is_active=True).count()
        context['seed_websites'] = SeedWebsite.objects.all().order_by('-priority', '-last_crawled_at')[:10]

        # Keyword Statistics
        context['total_keywords'] = OSINTKeyword.objects.count()
        context['active_keywords'] = OSINTKeyword.objects.filter(is_active=True).count()
        context['keywords'] = OSINTKeyword.objects.all().order_by('-is_active', 'name')[:10]

        # Crawling Statistics
        context['total_crawled'] = CrawledContent.objects.count()
        context['potential_scams'] = CrawledContent.objects.filter(has_potential_scam=True).count()
        context['recent_crawled'] = CrawledContent.objects.order_by('-crawled_at')[:5]

        # Auto-Report Statistics
        context['pending_reports'] = AutoGeneratedReport.objects.filter(status='pending').count()
        context['approved_reports'] = AutoGeneratedReport.objects.filter(status='approved').count()
        context['published_reports'] = AutoGeneratedReport.objects.filter(status='published').count()
        context['rejected_reports'] = AutoGeneratedReport.objects.filter(status='rejected').count()
        context['recent_auto_reports'] = AutoGeneratedReport.objects.order_by('-created_at')[:5]

        # Overall Statistics (Sum, not Count: we want page/match totals,
        # not the number of seed rows)
        context['total_pages_crawled'] = SeedWebsite.objects.aggregate(
            total=Sum('pages_crawled')
        )['total'] or 0
        context['total_matches'] = SeedWebsite.objects.aggregate(
            total=Sum('matches_found')
        )['total'] or 0

        # Seed websites due for crawling
        due_seeds = []
        for seed in SeedWebsite.objects.filter(is_active=True):
            if not seed.last_crawled_at:
                due_seeds.append(seed)
            else:
                hours_since = (now - seed.last_crawled_at).total_seconds() / 3600
                if hours_since >= seed.crawl_interval_hours:
                    due_seeds.append(seed)
        context['due_for_crawling'] = due_seeds[:5]

        return context
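The dashboard's two aggregate() calls can also be combined into one query (a minor variant for reference):

from django.db.models import Sum

totals = SeedWebsite.objects.aggregate(
    total_pages=Sum('pages_crawled'),
    total_matches=Sum('matches_found'),
)
# Each value is None when there are no seeds, hence the `or 0` guards above
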

class SeedWebsiteCreateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, CreateView):
    """Create a new seed website."""
    model = SeedWebsite
    form_class = SeedWebsiteForm
    template_name = 'osint/seed_website_form.html'
    success_message = "Seed website created successfully!"

    def form_valid(self, form):
        form.instance.created_by = self.request.user
        return super().form_valid(form)

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')


class SeedWebsiteUpdateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, UpdateView):
    """Update a seed website."""
    model = SeedWebsite
    form_class = SeedWebsiteForm
    template_name = 'osint/seed_website_form.html'
    success_message = "Seed website updated successfully!"

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')


class SeedWebsiteDeleteView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, DeleteView):
    """Delete a seed website."""
    model = SeedWebsite
    template_name = 'osint/seed_website_confirm_delete.html'
    success_message = "Seed website deleted successfully!"

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')


class OSINTKeywordCreateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, CreateView):
    """Create a new OSINT keyword."""
    model = OSINTKeyword
    form_class = OSINTKeywordForm
    template_name = 'osint/keyword_form.html'
    success_message = "Keyword created successfully!"

    def form_valid(self, form):
        form.instance.created_by = self.request.user
        return super().form_valid(form)

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')


class OSINTKeywordUpdateView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, UpdateView):
    """Update an OSINT keyword."""
    model = OSINTKeyword
    form_class = OSINTKeywordForm
    template_name = 'osint/keyword_form.html'
    success_message = "Keyword updated successfully!"

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')


class OSINTKeywordDeleteView(LoginRequiredMixin, AdminRequiredMixin, SuccessMessageMixin, DeleteView):
    """Delete an OSINT keyword."""
    model = OSINTKeyword
    template_name = 'osint/keyword_confirm_delete.html'
    success_message = "Keyword deleted successfully!"

    def get_success_url(self):
        return reverse_lazy('osint:admin_dashboard')

class StartCrawlingView(LoginRequiredMixin, AdminRequiredMixin, TemplateView):
    """Start OSINT crawling."""
    template_name = 'osint/start_crawling.html'

    def post(self, request, *args, **kwargs):
        seed_id = request.POST.get('seed_id')
        max_pages = request.POST.get('max_pages', 50)
        delay = request.POST.get('delay', 1.0)

        def run_crawl():
            import sys
            import traceback
            from django.db import connections

            try:
                if seed_id:
                    call_command('crawl_osint', '--seed-id', str(seed_id),
                                 '--max-pages', str(max_pages), '--delay', str(delay), verbosity=1)
                else:
                    call_command('crawl_osint', '--all',
                                 '--max-pages', str(max_pages), '--delay', str(delay), verbosity=1)
            except Exception as e:
                # Surface the failure in the server log for debugging
                print(f"Crawling error: {str(e)}\n{traceback.format_exc()}", file=sys.stderr)
            finally:
                # Close this thread's database connections
                connections.close_all()

        # Run in a background daemon thread so the request returns immediately
        thread = threading.Thread(target=run_crawl)
        thread.daemon = True
        thread.start()

        messages.success(request, f'Crawling started in background. Check results in a few minutes. (Max pages: {max_pages}, Delay: {delay}s)')
        return redirect('osint:admin_dashboard')

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context['seed_websites'] = SeedWebsite.objects.filter(is_active=True)
        return context
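Since osint/tasks.py already wraps the same management command, the daemon thread could instead dispatch to Celery, keeping the crawl off the web process and surviving restarts (a sketch of the post() body under that design):

from .tasks import crawl_osint_seeds, crawl_specific_seed

if seed_id:
    crawl_specific_seed.delay(seed_id)
else:
    crawl_osint_seeds.delay()

Note that crawl_specific_seed as written does not forward max_pages or delay; the task signatures would need extending to carry those options.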