Coverage for src / dataknobs_bots / knowledge / query / expander.py: 21%

73 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-16 10:13 -0700

1"""Contextual query expansion using conversation history. 

2 

3This module provides query expansion without requiring LLM calls, 

4using recent conversation context to enrich ambiguous queries. 

5""" 

6 

7from __future__ import annotations 

8 

9from dataclasses import dataclass 

10from typing import Any, Callable 

11 

12 

@dataclass
class Message:
    """One turn of a conversation.

    Attributes:
        role: Who produced the turn ("user", "assistant", "system").
        content: The text of the turn.
    """

    # Speaker of the turn; the expander compares it to "user" / "assistant".
    role: str
    # Raw text of the turn.
    content: str

24 

25 

class ContextualExpander:
    """Expands queries using conversation context.

    This expander enriches ambiguous or context-dependent queries
    by prepending terms taken from recent conversation turns.
    Unlike QueryTransformer, it doesn't require LLM calls.

    Example:
        ```python
        expander = ContextualExpander(max_context_turns=3)

        # User asks: "Show me an example"
        # Recent context: discussing chain-of-thought prompting
        expanded = expander.expand(
            "Show me an example",
            conversation_history
        )
        # Returns: "chain-of-thought prompting examples Show me an example"
        ```
    """

    # Upper bound on the number of keywords/topics prepended to a query.
    _MAX_TERMS = 5

    # Characters trimmed from both ends of every token before filtering.
    _PUNCTUATION = ".,!?\"'()[]{}:;"

    def __init__(
        self,
        max_context_turns: int = 3,
        include_assistant: bool = False,
        keyword_weight: int = 2,
    ):
        """Initialize the contextual expander.

        Args:
            max_context_turns: Maximum conversation turns to consider
            include_assistant: Whether to include assistant messages
            keyword_weight: How many times to repeat extracted keywords.
                NOTE: the value is stored on the instance, but none of the
                expansion methods in this class currently read it.
        """
        self.max_context_turns = max_context_turns
        self.include_assistant = include_assistant
        self.keyword_weight = keyword_weight

        # Common words to filter out
        self._stop_words = {
            "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "do", "does", "did", "will", "would", "could",
            "should", "may", "might", "must", "can", "this", "that", "these",
            "those", "i", "you", "he", "she", "it", "we", "they", "what", "which",
            "who", "when", "where", "why", "how", "all", "each", "every", "both",
            "few", "more", "most", "other", "some", "such", "no", "not", "only",
            "own", "same", "so", "than", "too", "very", "just", "also", "now",
            "here", "there", "about", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out",
            "on", "off", "over", "under", "again", "further", "then", "once",
            "and", "but", "or", "nor", "for", "yet", "because", "as", "until",
            "while", "of", "at", "by", "with", "without", "between", "me", "my",
            "your", "his", "her", "its", "our", "their", "please", "help", "want",
            "need", "like", "show", "tell", "give", "make", "let", "get", "see",
        }

    def expand(
        self,
        user_input: str,
        conversation_history: list[Message] | list[dict[str, Any]],
    ) -> str:
        """Expand query with conversation context.

        Args:
            user_input: The user's current message
            conversation_history: Recent conversation messages

        Returns:
            ``user_input`` prefixed with up to five context keywords, or
            ``user_input`` unchanged when no keyword was found.
        """
        recent = self._get_recent_context(
            self._normalize_messages(conversation_history)
        )
        return self._prepend_terms(self._extract_keywords(recent), user_input)

    def _normalize_messages(
        self,
        history: list[Message] | list[dict[str, Any]],
    ) -> list[Message]:
        """Normalize history to Message objects.

        Args:
            history: Conversation history as Message objects and/or dicts
                with "role" and "content" keys

        Returns:
            List of Message objects. Dicts missing "role" default to "user"
            and dicts missing "content" default to "". Items that are
            neither a Message nor a dict are skipped.
        """
        messages: list[Message] = []
        for item in history:
            if isinstance(item, Message):
                messages.append(item)
            elif isinstance(item, dict):
                messages.append(Message(
                    role=item.get("role", "user"),
                    content=item.get("content", ""),
                ))
        return messages

    def _get_recent_context(self, messages: list[Message]) -> list[str]:
        """Get recent relevant context from conversation.

        Args:
            messages: Conversation messages, oldest first

        Returns:
            Texts of the last ``max_context_turns`` eligible messages in
            chronological order. User messages are always eligible;
            assistant messages only when ``include_assistant`` is set.
        """
        selected: list[str] = []

        # Walk backwards so the newest turns are picked first.
        for msg in reversed(messages):
            if len(selected) >= self.max_context_turns:
                break

            if msg.role == "user" or (
                msg.role == "assistant" and self.include_assistant
            ):
                # A None / non-text content (e.g. a dict with
                # "content": None) would break the string handling
                # downstream. Treat it as empty text; it still counts
                # as a turn.
                text = msg.content if isinstance(msg.content, str) else ""
                selected.append(text)

        # Collected newest-first; hand back oldest-first.
        selected.reverse()
        return selected

    def _extract_keywords(self, context: list[str]) -> list[str]:
        """Extract meaningful keywords from context.

        Tokens are lower-cased, trimmed of surrounding punctuation and
        dropped when shorter than three characters, a stop word, purely
        numeric, or already seen.

        Args:
            context: List of context strings, oldest first

        Returns:
            At most five keywords in chronological order. When more than
            five are available, keywords from the most recent messages
            are kept in preference to older ones.
        """
        seen: set[str] = set()
        per_message: list[list[str]] = []

        for text in context:
            found: list[str] = []
            for word in text.lower().split():
                cleaned = word.strip(self._PUNCTUATION)
                if (
                    len(cleaned) < 3
                    or cleaned in self._stop_words
                    or cleaned.isdigit()
                    or cleaned in seen
                ):
                    continue
                seen.add(cleaned)
                found.append(cleaned)
            per_message.append(found)

        return self._cap_terms(per_message)

    @classmethod
    def _cap_terms(cls, per_message: list[list[str]]) -> list[str]:
        """Flatten per-message terms, keeping at most ``_MAX_TERMS``.

        Args:
            per_message: One list of terms per message, oldest message first

        Returns:
            Terms in chronological order. The cap is filled starting from
            the newest message so recent context is never crowded out by
            older turns.
        """
        budget = cls._MAX_TERMS
        kept: list[list[str]] = []

        for terms in reversed(per_message):
            if budget <= 0:
                break
            taken = terms[:budget]
            kept.append(taken)
            budget -= len(taken)

        kept.reverse()
        return [term for terms in kept for term in terms]

    @staticmethod
    def _prepend_terms(terms: list[str], user_input: str) -> str:
        """Return ``user_input`` prefixed by the space-joined ``terms``."""
        prefix = " ".join(terms)
        if prefix:
            return f"{prefix} {user_input}"
        return user_input

    def expand_with_topics(
        self,
        user_input: str,
        conversation_history: list[Message] | list[dict[str, Any]],
        topic_extractor: Callable[[str], list[str]] | None = None,
    ) -> str:
        """Expand query with extracted topics.

        Enhanced expansion that uses a custom topic extractor.

        Args:
            user_input: The user's current message
            conversation_history: Recent conversation messages
            topic_extractor: Optional function to extract topics from text.
                It is called once per recent message. When omitted, the
                built-in keyword extraction is used instead.

        Returns:
            ``user_input`` prefixed with up to five topics, or
            ``user_input`` unchanged when no topic was found.
        """
        recent = self._get_recent_context(
            self._normalize_messages(conversation_history)
        )

        if topic_extractor:
            # Use custom topic extraction; topics are not de-duplicated.
            topics = self._cap_terms(
                [list(topic_extractor(text)) for text in recent]
            )
        else:
            # Fall back to keyword extraction
            topics = self._extract_keywords(recent)

        return self._prepend_terms(topics, user_input)

229 

230 

def is_ambiguous_query(query: str) -> bool:
    """Check if a query is likely ambiguous and needs expansion.

    A query is treated as ambiguous when it has fewer than four
    whitespace-separated words, or when one of its words is a
    context-dependent marker such as "this", "it" or "example".
    Markers are matched as whole words, case-insensitively, so "it"
    does not fire on "with" or "git".

    Args:
        query: The query to check

    Returns:
        True if query appears ambiguous

    Example:
        ```python
        is_ambiguous_query("Show me an example")  # True
        is_ambiguous_query("How do I configure OAuth?")  # False
        ```
    """
    # Short queries are often ambiguous
    if len(query.split()) < 4:
        return True

    # Demonstratives and similar words usually point at earlier context.
    # "examples" is listed explicitly because matching is on whole words.
    markers = frozenset({
        "this", "that", "these", "those", "it", "them",
        "example", "examples", "more", "another", "same", "similar",
    })

    # Replace every non-alphanumeric character with a space so that
    # punctuation and contractions ("it's", "that?") still expose the
    # marker as a standalone token.
    normalized = "".join(
        ch if ch.isalnum() else " " for ch in query.lower()
    )
    return not markers.isdisjoint(normalized.split())