Updates

ETB-API/knowledge_learning/services/knowledge_base_search.py (new file, 422 lines)

@@ -0,0 +1,422 @@
import logging
from typing import Dict, List, Any, Optional
from django.db.models import Q, Count
from django.utils import timezone
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
import re

from ..models import KnowledgeBaseArticle, KnowledgeBaseUsage

logger = logging.getLogger(__name__)


class KnowledgeBaseSearchService:
    """Service for searching and discovering knowledge base articles"""

    def __init__(self):
        self.model_version = "v1.0"
        self.min_similarity_threshold = 0.1

    def search(
        self,
        query: str,
        article_types: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        difficulty_levels: Optional[List[str]] = None,
        limit: int = 20,
        offset: int = 0
    ) -> Dict[str, Any]:
        """Search knowledge base articles with optional filters"""
        try:
            # Build base queryset
            queryset = KnowledgeBaseArticle.objects.filter(status='PUBLISHED')

            # Apply filters
            if article_types:
                queryset = queryset.filter(article_type__in=article_types)

            if categories:
                queryset = queryset.filter(category__in=categories)

            if difficulty_levels:
                queryset = queryset.filter(difficulty_level__in=difficulty_levels)

            # Get all matching articles for similarity calculation
            all_articles = list(queryset)

            if not all_articles:
                return {
                    'results': [],
                    'total_count': 0,
                    'query': query,
                    'filters': {
                        'article_types': article_types,
                        'categories': categories,
                        'difficulty_levels': difficulty_levels
                    }
                }

            # Calculate similarity scores
            articles_with_scores = self._calculate_similarity_scores(query, all_articles)

            # Sort by relevance (weighted blend of similarity and popularity)
            articles_with_scores.sort(
                key=lambda x: (x['similarity_score'] * 0.7) + (x['popularity_score'] * 0.3),
                reverse=True
            )

            # Apply pagination
            paginated_articles = articles_with_scores[offset:offset + limit]

            # Format results
            results = []
            for article_data in paginated_articles:
                article = article_data['article']
                results.append({
                    'id': str(article.id),
                    'title': article.title,
                    'slug': article.slug,
                    'summary': article.summary,
                    'article_type': article.article_type,
                    'category': article.category,
                    'subcategory': article.subcategory,
                    'tags': article.tags,
                    'difficulty_level': article.difficulty_level,
                    'view_count': article.view_count,
                    'created_at': article.created_at.isoformat(),
                    'updated_at': article.updated_at.isoformat(),
                    'author': article.author.username if article.author else None,
                    'similarity_score': article_data['similarity_score'],
                    'relevance_score': article_data['relevance_score'],
                    'popularity_score': article_data['popularity_score'],
                    'matching_keywords': article_data['matching_keywords']
                })

            return {
                'results': results,
                'total_count': len(articles_with_scores),
                'query': query,
                'filters': {
                    'article_types': article_types,
                    'categories': categories,
                    'difficulty_levels': difficulty_levels
                },
                'pagination': {
                    'limit': limit,
                    'offset': offset,
                    'has_more': (offset + limit) < len(articles_with_scores)
                }
            }

        except Exception as e:
            logger.error(f"Failed to search knowledge base: {str(e)}")
            raise

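    # Worked example of the relevance blend above, with made-up numbers: an
    # article with similarity 0.4 and popularity 0.8 scores
    # (0.4 * 0.7) + (0.8 * 0.3) = 0.52, while one with similarity 0.5 and
    # popularity 0.1 scores (0.5 * 0.7) + (0.1 * 0.3) = 0.38. A heavily
    # viewed, recently updated article can therefore outrank a slightly
    # better textual match; the 0.7/0.3 split is a tunable choice, not a
    # derived constant.
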
    def find_related_articles(
        self,
        article_id: str,
        limit: int = 5
    ) -> List[Dict[str, Any]]:
        """Find articles related to a specific article"""
        try:
            article = KnowledgeBaseArticle.objects.get(id=article_id)

            # Find articles sharing a category, subcategory, tags, or type.
            # Note: tags__overlap is the PostgreSQL ArrayField lookup, so this
            # assumes tags is a django.contrib.postgres ArrayField.
            related_articles = KnowledgeBaseArticle.objects.filter(
                status='PUBLISHED'
            ).exclude(id=article_id).filter(
                Q(category=article.category) |
                Q(subcategory=article.subcategory) |
                Q(tags__overlap=article.tags) |
                Q(article_type=article.article_type)
            ).distinct()

            if not related_articles.exists():
                return []

            # Calculate similarity scores
            article_text = f"{article.title} {article.summary} {' '.join(article.tags)}"
            articles_with_scores = []

            for related_article in related_articles:
                related_text = f"{related_article.title} {related_article.summary} {' '.join(related_article.tags)}"
                similarity = self._calculate_text_similarity(article_text, related_text)

                if similarity >= self.min_similarity_threshold:
                    articles_with_scores.append({
                        'article': related_article,
                        'similarity_score': similarity
                    })

            # Sort by similarity and return top matches
            articles_with_scores.sort(key=lambda x: x['similarity_score'], reverse=True)

            results = []
            for article_data in articles_with_scores[:limit]:
                article = article_data['article']
                results.append({
                    'id': str(article.id),
                    'title': article.title,
                    'slug': article.slug,
                    'summary': article.summary,
                    'article_type': article.article_type,
                    'category': article.category,
                    'similarity_score': article_data['similarity_score']
                })

            return results

        except KnowledgeBaseArticle.DoesNotExist:
            raise ValueError(f"Article with ID {article_id} not found")
        except Exception as e:
            logger.error(f"Failed to find related articles: {str(e)}")
            raise

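    # Sketch of calling this from a detail view (names are illustrative,
    # not part of this module):
    #
    #   related = KnowledgeBaseSearchService().find_related_articles(
    #       str(some_article.id), limit=3
    #   )
    #
    # Candidates are pre-filtered by the Q() clauses above, so an article
    # sharing no category, subcategory, tag, or type is never scored.
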
    def suggest_articles_for_incident(
        self,
        incident_title: str,
        incident_description: str,
        incident_category: str,
        limit: int = 5
    ) -> List[Dict[str, Any]]:
        """Suggest knowledge base articles for an incident"""
        try:
            # Build search query from incident data
            search_query = f"{incident_title} {incident_description} {incident_category}"

            # Search for relevant articles
            search_results = self.search(
                query=search_query,
                categories=[incident_category] if incident_category else None,
                limit=limit * 2  # Fetch extra results so filtering can still fill the limit
            )

            # Filter and rank results
            relevant_articles = []
            for result in search_results['results']:
                # Boost score for category matches
                category_boost = 0.2 if result['category'] == incident_category else 0.0

                # Boost score for runbooks and troubleshooting guides
                type_boost = 0.1 if result['article_type'] in ['RUNBOOK', 'TROUBLESHOOTING'] else 0.0

                final_score = result['similarity_score'] + category_boost + type_boost

                if final_score >= self.min_similarity_threshold:
                    relevant_articles.append({
                        **result,
                        'final_score': final_score
                    })

            # Sort by final score and return top matches
            relevant_articles.sort(key=lambda x: x['final_score'], reverse=True)

            return relevant_articles[:limit]

        except Exception as e:
            logger.error(f"Failed to suggest articles for incident: {str(e)}")
            raise

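    # Worked example of the boosting above (made-up numbers): a RUNBOOK in
    # the incident's category with similarity 0.15 scores
    # 0.15 + 0.2 + 0.1 = 0.45, while an off-category FAQ with similarity 0.3
    # stays at 0.3. The boost values are hand-picked weights; they should be
    # tuned against real incident data.
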
    def get_popular_articles(
        self,
        category: Optional[str] = None,
        article_type: Optional[str] = None,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """Get popular articles based on view count and recent activity"""
        try:
            queryset = KnowledgeBaseArticle.objects.filter(status='PUBLISHED')

            if category:
                queryset = queryset.filter(category=category)

            if article_type:
                queryset = queryset.filter(article_type=article_type)

            # Order by view count, breaking ties by most recently updated
            popular_articles = queryset.order_by('-view_count', '-updated_at')[:limit]

            results = []
            for article in popular_articles:
                results.append({
                    'id': str(article.id),
                    'title': article.title,
                    'slug': article.slug,
                    'summary': article.summary,
                    'article_type': article.article_type,
                    'category': article.category,
                    'view_count': article.view_count,
                    'updated_at': article.updated_at.isoformat(),
                    'is_featured': article.is_featured
                })

            return results

        except Exception as e:
            logger.error(f"Failed to get popular articles: {str(e)}")
            raise

    def get_articles_due_for_review(self) -> List[Dict[str, Any]]:
        """Get articles that are due for review"""
        try:
            due_articles = KnowledgeBaseArticle.objects.filter(
                next_review_due__lt=timezone.now(),
                status='PUBLISHED'
            ).order_by('next_review_due')

            results = []
            for article in due_articles:
                results.append({
                    'id': str(article.id),
                    'title': article.title,
                    'slug': article.slug,
                    'category': article.category,
                    'last_reviewed': article.last_reviewed.isoformat() if article.last_reviewed else None,
                    'next_review_due': article.next_review_due.isoformat(),
                    'maintainer': article.maintainer.username if article.maintainer else None,
                    'days_overdue': (timezone.now() - article.next_review_due).days
                })

            return results

        except Exception as e:
            logger.error(f"Failed to get articles due for review: {str(e)}")
            raise

    def _calculate_similarity_scores(
        self,
        query: str,
        articles: List[KnowledgeBaseArticle]
    ) -> List[Dict[str, Any]]:
        """Calculate similarity scores for articles against a query"""
        if not articles:
            return []

        # Prepare texts for similarity calculation
        query_text = self._preprocess_text(query)
        article_texts = []

        for article in articles:
            article_text = f"{article.title} {article.summary} {' '.join(article.tags)} {' '.join(article.search_keywords)}"
            article_texts.append(self._preprocess_text(article_text))

        # Calculate similarity using simple keyword matching (fallback for
        # the commented-out TF-IDF approach)
        try:
            similarities = [
                self._calculate_keyword_similarity(query_text, article_text)
                for article_text in article_texts
            ]
        except Exception as e:
            logger.warning(f"Failed to calculate similarity: {str(e)}")
            similarities = [0.0] * len(article_texts)

        # Prepare results with additional scoring
        results = []
        for i, article in enumerate(articles):
            similarity_score = float(similarities[i])
            popularity_score = self._calculate_popularity_score(article)
            relevance_score = (similarity_score * 0.7) + (popularity_score * 0.3)

            matching_keywords = self._find_matching_keywords(query, article)

            results.append({
                'article': article,
                'similarity_score': similarity_score,
                'popularity_score': popularity_score,
                'relevance_score': relevance_score,
                'matching_keywords': matching_keywords
            })

        return results

    def _calculate_text_similarity(self, text1: str, text2: str) -> float:
        """Calculate Jaccard similarity between two texts over
        whitespace-split tokens (simple keyword-matching fallback)."""
        try:
            if not text1.strip() or not text2.strip():
                return 0.0

            words1 = set(text1.lower().split())
            words2 = set(text2.lower().split())

            if not words1 or not words2:
                return 0.0

            intersection = words1.intersection(words2)
            union = words1.union(words2)

            return len(intersection) / len(union) if union else 0.0

        except Exception as e:
            logger.warning(f"Failed to calculate text similarity: {str(e)}")
            return 0.0

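    # Worked Jaccard example: "database connection timeout" vs. "connection
    # pool timeout" tokenise to {database, connection, timeout} and
    # {connection, pool, timeout}; the intersection has 2 tokens and the
    # union 4, so the similarity is 2 / 4 = 0.5. With
    # min_similarity_threshold = 0.1, even one shared token among ten total
    # can qualify.
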
    def _calculate_keyword_similarity(self, query: str, article_text: str) -> float:
        """Jaccard similarity over extracted keywords (stop words and
        short tokens removed)."""
        query_words = set(self._extract_keywords(query.lower()))
        article_words = set(self._extract_keywords(article_text.lower()))

        if not query_words or not article_words:
            return 0.0

        intersection = query_words.intersection(article_words)
        union = query_words.union(article_words)

        return len(intersection) / len(union) if union else 0.0

    def _calculate_popularity_score(self, article: KnowledgeBaseArticle) -> float:
        """Calculate popularity score based on views and recency"""
        # Normalize view count (assuming max views is around 1000)
        view_score = min(article.view_count / 1000.0, 1.0)

        # Calculate recency score (more recent = higher score)
        days_since_update = (timezone.now() - article.updated_at).days
        recency_score = max(0, 1 - (days_since_update / 365.0))  # Decay over a year

        # Featured articles get a boost
        featured_boost = 0.1 if article.is_featured else 0.0

        return (view_score * 0.6) + (recency_score * 0.3) + featured_boost

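    # Worked example with made-up numbers: 500 views, updated 73 days ago,
    # featured. view_score = 0.5, recency_score = 1 - 73/365 = 0.8, so the
    # result is (0.5 * 0.6) + (0.8 * 0.3) + 0.1 = 0.64. The maximum possible
    # score is exactly 1.0 (0.6 + 0.3 + 0.1), so the featured boost never
    # pushes a score out of range.
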
    def _preprocess_text(self, text: str) -> str:
        """Preprocess text for similarity calculation"""
        # Convert to lowercase
        text = text.lower()

        # Replace special characters with spaces, keeping word characters
        text = re.sub(r'[^\w\s]', ' ', text)

        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

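    # Example: "Fix DB-01: can't connect!" becomes "fix db 01 can t connect".
    # Punctuation turns into spaces, so hyphenated IDs and contractions split
    # into separate tokens before keyword extraction.
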
    def _extract_keywords(self, text: str) -> List[str]:
        """Extract keywords from text"""
        # Simple keyword extraction; production code might use more
        # sophisticated methods
        words = text.split()

        # Filter out common stop words
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
            'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'
        }

        keywords = [word for word in words if len(word) > 2 and word not in stop_words]
        return keywords

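    # Example: "how to restart the payment service" yields
    # ['how', 'restart', 'payment', 'service']: 'to' and 'the' are stop
    # words, and tokens of two characters or fewer are dropped.
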
    def _find_matching_keywords(self, query: str, article: KnowledgeBaseArticle) -> List[str]:
        """Find keywords shared between the query and an article"""
        query_keywords = set(self._extract_keywords(query.lower()))

        # Check article title, summary, tags, and search keywords
        article_text = f"{article.title} {article.summary} {' '.join(article.tags)} {' '.join(article.search_keywords)}"
        article_keywords = set(self._extract_keywords(article_text.lower()))

        matching_keywords = list(query_keywords.intersection(article_keywords))
        return matching_keywords[:5]  # At most five matches (set order is arbitrary)
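
# Hypothetical end-to-end usage, as a sketch; the incident fields and values
# below are illustrative, not part of this module:
#
#   service = KnowledgeBaseSearchService()
#   hits = service.search("database connection timeout", limit=5)
#   for hit in hits['results']:
#       print(hit['title'], hit['relevance_score'], hit['matching_keywords'])
#
#   suggestions = service.suggest_articles_for_incident(
#       incident_title="API latency spike",
#       incident_description="p95 latency above 2s on checkout",
#       incident_category="PERFORMANCE",
#       limit=3,
#   )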