forked from usemoss/moss
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcomprehensive_sample.py
More file actions
383 lines (345 loc) · 17.2 KB
/
comprehensive_sample.py
File metadata and controls
383 lines (345 loc) · 17.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
"""
Comprehensive End-to-End Moss SDK Sample
This example demonstrates the complete workflow of the Moss Python SDK, showcasing ALL available functions:
- Index creation and management
- Document operations (add, retrieve, update, delete)
- Semantic search functionality
- Advanced querying with metadata filtering
- Index lifecycle management
- Error handling best practices
The sample uses a dynamic index name based on current timestamp to avoid conflicts.
"""
import asyncio
import os
from typing import List
from dotenv import load_dotenv
from datetime import datetime
from inferedge_moss import (
DocumentInfo,
GetDocumentsOptions,
MossClient,
MutationOptions,
QueryOptions,
)
# Load environment variables
load_dotenv()
async def comprehensive_moss_example() -> None:
    """
    Complete end-to-end example demonstrating ALL Moss SDK functionality.
    This comprehensive sample covers:
    1. Client initialization
    2. Index creation with initial documents
    3. Index information retrieval
    4. Index listing
    5. Document addition with upsert
    6. Document retrieval (all and specific)
    7. Index loading for querying
    8. Semantic search operations
    9. Document deletion
    10. Index cleanup
    11. Error handling throughout
    """
    print("Comprehensive Moss SDK End-to-End Sample")
    print("=" * 60)
    # Initialize client with project credentials from environment
    # (loaded from .env via load_dotenv() at module import time).
    project_id = os.getenv("MOSS_PROJECT_ID")
    project_key = os.getenv("MOSS_PROJECT_KEY")
    # Guard clause: abort early with setup instructions if either credential
    # is missing, instead of failing later inside an SDK call.
    if not project_id or not project_key:
        print("❌ Error: Missing environment variables!")
        print("Please set MOSS_PROJECT_ID and MOSS_PROJECT_KEY in .env file")
        print("Copy .env.template to .env and fill in your credentials")
        return
    client = MossClient(project_id, project_key)
    # Create comprehensive document collection with rich metadata.
    # NOTE(review): metadata values are all strings (even word_count) —
    # presumably the SDK requires str->str metadata; confirm against SDK docs.
    documents: List[DocumentInfo] = [
        DocumentInfo(
            id="tech-ai-001",
            text="Artificial Intelligence (AI) is transforming industries by enabling machines to perform tasks that typically require human intelligence. From healthcare diagnostics to autonomous vehicles, AI applications are revolutionizing how we work and live.",
            metadata={
                "category": "technology",
                "subcategory": "artificial_intelligence",
                "difficulty": "beginner",
                "topic": "ai_overview",
                "author": "Tech Research Team",
                "tags": "ai,technology,automation,machine_learning",
                "word_count": "42",
                "reading_time": "1 minute",
                "published_date": "2024-01-15",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="tech-ml-002",
            text="Machine Learning is a subset of AI that enables systems to automatically learn and improve from experience without being explicitly programmed. It uses algorithms to analyze data, identify patterns, and make predictions or decisions.",
            metadata={
                "category": "technology",
                "subcategory": "machine_learning",
                "difficulty": "intermediate",
                "topic": "ml_fundamentals",
                "author": "ML Engineering Team",
                "tags": "machine_learning,algorithms,data_science,predictions",
                "word_count": "38",
                "reading_time": "1 minute",
                "published_date": "2024-01-20",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="tech-dl-003",
            text="Deep Learning uses artificial neural networks with multiple layers to model and understand complex patterns in data. It has achieved breakthrough results in image recognition, natural language processing, and game playing.",
            metadata={
                "category": "technology",
                "subcategory": "deep_learning",
                "difficulty": "advanced",
                "topic": "neural_networks",
                "author": "Deep Learning Lab",
                "tags": "deep_learning,neural_networks,image_recognition,nlp",
                "word_count": "35",
                "reading_time": "1 minute",
                "published_date": "2024-01-25",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="tech-nlp-004",
            text="Natural Language Processing (NLP) enables computers to understand, interpret, and generate human language. Applications include chatbots, language translation, sentiment analysis, and text summarization.",
            metadata={
                "category": "technology",
                "subcategory": "natural_language_processing",
                "difficulty": "intermediate",
                "topic": "language_processing",
                "author": "NLP Research Group",
                "tags": "nlp,language,chatbots,translation,sentiment_analysis",
                "word_count": "31",
                "reading_time": "1 minute",
                "published_date": "2024-02-01",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="tech-cv-005",
            text="Computer Vision allows machines to interpret and understand visual information from the world. It powers applications like facial recognition, medical image analysis, autonomous driving, and quality control in manufacturing.",
            metadata={
                "category": "technology",
                "subcategory": "computer_vision",
                "difficulty": "intermediate",
                "topic": "visual_recognition",
                "author": "Computer Vision Team",
                "tags": "computer_vision,image_processing,facial_recognition,autonomous_driving",
                "word_count": "33",
                "reading_time": "1 minute",
                "published_date": "2024-02-05",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="business-data-006",
            text="Data Science combines statistics, programming, and domain expertise to extract actionable insights from data. It involves data collection, cleaning, analysis, and visualization to support business decision-making.",
            metadata={
                "category": "business",
                "subcategory": "data_science",
                "difficulty": "intermediate",
                "topic": "analytics",
                "author": "Data Analytics Team",
                "tags": "data_science,statistics,analytics,business_intelligence",
                "word_count": "32",
                "reading_time": "1 minute",
                "published_date": "2024-02-10",
                "language": "en"
            }
        ),
        DocumentInfo(
            id="business-cloud-007",
            text="Cloud Computing provides on-demand access to computing resources over the internet, including servers, storage, databases, and software. It offers scalability, cost-efficiency, and global accessibility for businesses.",
            metadata={
                "category": "business",
                "subcategory": "cloud_computing",
                "difficulty": "beginner",
                "topic": "infrastructure",
                "author": "Cloud Architecture Team",
                "tags": "cloud_computing,infrastructure,scalability,saas",
                "word_count": "30",
                "reading_time": "1 minute",
                "published_date": "2024-02-15",
                "language": "en"
            }
        )
    ]
    # Create dynamic index name with timestamp so repeated runs of this
    # sample never collide with an existing index.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    index_name = f"comprehensive-demo-{timestamp}"
    try:
        # Step 1: create the index seeded with the initial document set.
        # NOTE(review): "moss-minilm" is presumably an embedding-model id —
        # confirm the list of valid model ids against the SDK docs.
        print(f"\nStep 1: Creating index '{index_name}' with {len(documents)} documents...")
        created = await client.create_index(index_name, documents, "moss-minilm")
        print(f"Index created successfully (job: {created.job_id}, index: {created.index_name}, docs: {created.doc_count})")
        # Step 2: read back the index metadata just created.
        print("\nStep 2: Retrieving index information...")
        index_info = await client.get_index(index_name)
        print("Index Details:")
        print(f" - Name: {index_info.name}")
        print(f" - Document Count: {index_info.doc_count}")
        print(f" - Model: {index_info.model.id}")
        print(f" - Status: {index_info.status}")
        # created_at may not exist on all SDK versions, hence the hasattr guard.
        print(f" - Created: {index_info.created_at if hasattr(index_info, 'created_at') else 'N/A'}")
        # Step 3: enumerate every index visible to this project.
        print("\nStep 3: Listing all available indexes...")
        indexes = await client.list_indexes()
        print(f"Found {len(indexes)} total indexes:")
        for idx in indexes:
            print(f" - {idx.name}: {idx.doc_count} docs, status: {idx.status}")
        # Step 4: add three more documents with upsert=True (so re-adding an
        # existing id would overwrite rather than error).
        print("\nStep 4: Adding additional documents with upsert...")
        additional_docs: List[DocumentInfo] = [
            DocumentInfo(
                id="security-cyber-008",
                text="Cybersecurity protects digital systems, networks, and data from cyber threats. It involves implementing security measures, monitoring for vulnerabilities, and responding to incidents to maintain data integrity and privacy.",
                metadata={
                    "category": "security",
                    "subcategory": "cybersecurity",
                    "difficulty": "intermediate",
                    "topic": "digital_security",
                    "author": "Security Team",
                    "tags": "cybersecurity,data_protection,privacy,threats",
                    "word_count": "34",
                    "reading_time": "1 minute",
                    "published_date": "2024-02-20",
                    "language": "en"
                }
            ),
            DocumentInfo(
                id="health-biotech-009",
                text="Biotechnology applies biological processes and organisms to develop products and technologies that improve human health and the environment. It includes genetic engineering, drug development, and personalized medicine.",
                metadata={
                    "category": "healthcare",
                    "subcategory": "biotechnology",
                    "difficulty": "advanced",
                    "topic": "medical_innovation",
                    "author": "Biotech Research Lab",
                    "tags": "biotechnology,genetic_engineering,drug_development,personalized_medicine",
                    "word_count": "33",
                    "reading_time": "1 minute",
                    "published_date": "2024-02-25",
                    "language": "en"
                }
            ),
            DocumentInfo(
                id="env-sustainability-010",
                text="Sustainable Technology focuses on developing solutions that meet present needs without compromising future generations. It includes renewable energy, green computing, and environmentally friendly manufacturing processes.",
                metadata={
                    "category": "environment",
                    "subcategory": "sustainability",
                    "difficulty": "intermediate",
                    "topic": "green_technology",
                    "author": "Sustainability Team",
                    "tags": "sustainability,renewable_energy,green_computing,environment",
                    "word_count": "31",
                    "reading_time": "1 minute",
                    "published_date": "2024-03-01",
                    "language": "en"
                }
            )
        ]
        add_result = await client.add_docs(index_name, additional_docs, MutationOptions(upsert=True))
        print(f"Added {len(additional_docs)} additional documents (job: {add_result.job_id}, docs: {add_result.doc_count})")
        # Step 5: fetch every document currently in the index.
        print("\nStep 5: Retrieving all documents from index...")
        all_docs = await client.get_docs(index_name)
        print(f"Total documents in index: {len(all_docs)}")
        # Display sample of documents with metadata (first three only).
        print("Sample documents preview:")
        for i, doc in enumerate(all_docs[:3]):
            # Truncate long texts to an 80-char preview.
            text_preview = doc.text[:80] + "..." if len(doc.text) > 80 else doc.text
            print(f" {i+1}. [{doc.id}] {text_preview}")
            if doc.metadata:
                category = doc.metadata.get('category', 'N/A')
                difficulty = doc.metadata.get('difficulty', 'N/A')
                print(f" Category: {category} | Difficulty: {difficulty}")
        # Step 6: fetch a targeted subset of documents by id.
        print("\nStep 6: Retrieving specific documents by ID...")
        target_doc_ids = ["tech-ai-001", "business-data-006", "security-cyber-008"]
        specific_docs = await client.get_docs(
            index_name,
            GetDocumentsOptions(doc_ids=target_doc_ids)
        )
        print(f"Retrieved {len(specific_docs)} specific documents:")
        for doc in specific_docs:
            text_preview = doc.text[:60] + "..." if len(doc.text) > 60 else doc.text
            print(f" - [{doc.id}] {text_preview}")
            if doc.metadata:
                # Tags are stored as a single comma-separated string; show
                # at most the first three.
                tags_str = doc.metadata.get('tags', '')
                tags = tags_str.split(',') if tags_str else []
                print(f" Tags: {', '.join(tags[:3])}{'...' if len(tags) > 3 else ''}")
        # Step 7: load the index before querying.
        # NOTE(review): presumably load_index warms the index into memory and
        # is required before query() — confirm against SDK docs.
        print("\nStep 7: Loading index for semantic search operations...")
        loaded_index = await client.load_index(index_name)
        print(f"Index loaded for querying: {loaded_index}")
        # Step 8: run several semantic searches, each as (query text, top_k).
        print("\nStep 8: Performing comprehensive semantic search tests...")
        search_queries = [
            ("artificial intelligence and machine learning", 4),
            ("data analysis and business insights", 3),
            ("visual recognition and image processing", 3),
            ("cybersecurity and data protection", 2),
            ("healthcare innovation and biotechnology", 2),
            ("sustainable technology and environment", 2),
        ]
        for i, (query, top_k) in enumerate(search_queries, 1):
            print(f"\n Search {i}: \"{query}\"")
            options = QueryOptions(top_k=top_k)
            search_results = await client.query(index_name, query, options)
            print(f" Time taken: {search_results.time_taken_ms} ms")
            print(f" Found {len(search_results.docs)} results:")
            for j, result in enumerate(search_results.docs, 1):
                text_preview = result.text[:70] + "..." if len(result.text) > 70 else result.text
                print(f" {j}. [{result.id}] Score: {result.score:.3f}")
                print(f" {text_preview}")
                if result.metadata:
                    category = result.metadata.get('category', 'N/A')
                    topic = result.metadata.get('topic', 'N/A')
                    print(f" {category} | {topic}")
        # Step 9: delete two of the documents added in step 4.
        print("\nStep 9: Demonstrating document deletion...")
        docs_to_delete = ["health-biotech-009", "env-sustainability-010"]
        delete_result = await client.delete_docs(index_name, docs_to_delete)
        print(f"Deleted documents (job: {delete_result.job_id}, remaining docs: {delete_result.doc_count})")
        # Step 10: confirm the deletion by re-counting documents.
        print("\nStep 10: Verifying document count after deletion...")
        remaining_docs = await client.get_docs(index_name)
        print(f"Remaining documents: {len(remaining_docs)}")
        # Step 11: one final query to validate the index still serves search.
        print("\nStep 11: Final search validation...")
        final_search = await client.query(
            index_name,
            "technology innovation and automation",
            QueryOptions(top_k=5),
        )
        print("Final search results:")
        print(f" Query: \"{final_search.query}\"")
        print(f" Time: {final_search.time_taken_ms} ms")
        print(f" Results: {len(final_search.docs)}")
        for i, item in enumerate(final_search.docs, 1):
            print(f" {i}. [{item.id}] Score: {item.score:.3f}")
        # Step 12: remove the throwaway demo index.
        print("\nStep 12: Cleaning up - deleting the test index...")
        deleted = await client.delete_index(index_name)
        print(f"Index deleted: {deleted}")
        print("\nComprehensive Moss SDK Example Completed Successfully!")
        print("=" * 60)
        print("Summary of operations performed:")
        print(" - Index creation with initial documents")
        print(" - Index information retrieval")
        print(" - Index listing")
        print(" - Document addition with upsert")
        print(" - Document retrieval (all and specific)")
        print(" - Index loading for querying")
        print(" - Multiple semantic search operations")
        print(" - Document deletion")
        print(" - Index cleanup")
        print(" - Comprehensive error handling")
    except Exception as error:
        # Broad catch is acceptable here: this is the top-level boundary of a
        # demo script, and we report whatever detail the error object carries.
        print(f"Error occurred: {error}")
        if hasattr(error, 'message'):
            print(f" Error message: {error.message}")
        if hasattr(error, 'status_code'):
            print(f" Status code: {error.status_code}")
        # Attempt cleanup even if there was an error, so the demo index does
        # not leak; cleanup failures are reported but not re-raised.
        try:
            print("\nAttempting cleanup due to error...")
            await client.delete_index(index_name)
            print("Cleanup completed")
        except Exception:
            print("Cleanup failed - manual cleanup may be required")
# Export for use in tests or other modules: the coroutine function is the
# module's only public name.
__all__ = ["comprehensive_moss_example"]

# Run the example only when executed as a script (not on import).
if __name__ == "__main__":
    asyncio.run(comprehensive_moss_example())