01-intro 作業

▌程式

import requests
from elasticsearch import Elasticsearch
from tqdm import tqdm

print("="*50)
print("Q1: 取得 Build Hash")
print("="*50)

# Q1: 取得 Elasticsearch 版本資訊
response = requests.get('http://localhost:9200')
cluster_info = response.json()

build_hash = cluster_info['version']['build_hash']
print(f"Build Hash: {build_hash}")

print("\n" + "="*50)
print("Q2: 載入資料和建立索引")
print("="*50)

# 載入 FAQ 文檔
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
for course in documents_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

print(f"總共載入 {len(documents)} 個文檔")

# 建立 Elasticsearch 客戶端
es_client = Elasticsearch('http://localhost:9200')

# 定義索引設定(使用講師的格式)
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# 建立索引(簡化版本)
try:
    es_client.indices.delete(index=index_name, ignore=[400, 404])
except:
    pass

es_client.indices.create(index=index_name, body=index_settings)

# 索引文檔
for doc in tqdm(documents, desc="索引文檔"):
    es_client.index(index=index_name, body=doc)

print("\n=== Q2 答案 ===")
print("使用的函數: index")

print("\n" + "="*50)
print("Q3: 搜尋查詢")
print("="*50)

# Q3: 搜尋 "How do execute a command on a Kubernetes pod?"
def search_elasticsearch(query, course_filter=None, size=10):
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],  # question 權重 4
                        "type": "best_fields"
                    }
                }
            }
        }
    }
    
    if course_filter:
        search_query["query"]["bool"]["filter"] = {
            "term": {"course": course_filter}
        }
    
    response = es_client.search(index=index_name, body=search_query)
    return response

# 執行 Q3 搜尋
query_q3 = "How do execute a command on a Kubernetes pod?"
response_q3 = search_elasticsearch(query_q3)

# 取得最高分數
top_score = response_q3['hits']['hits'][0]['_score']
print(f"最高分數: {top_score}")

# 顯示前 3 個結果
print("\n搜尋結果:")
for i, hit in enumerate(response_q3['hits']['hits'][:3]):
    print(f"{i+1}. 分數: {hit['_score']:.2f}")
    print(f"   問題: {hit['_source']['question']}")
    print(f"   課程: {hit['_source']['course']}")
    print("---")

print(f"\n=== Q3 答案 ===")
print(f"最高分數: {top_score}")

print("\n" + "="*50)
print("Q4: 過濾搜尋")
print("="*50)

# Q4: 搜尋 "How do copy a file to a Docker container?" 限制 machine-learning-zoomcamp
query_q4 = "How do copy a file to a Docker container?"
response_q4 = search_elasticsearch(query_q4, course_filter="machine-learning-zoomcamp", size=3)

print("Q4 搜尋結果 (machine-learning-zoomcamp, 3個結果):")
for i, hit in enumerate(response_q4['hits']['hits']):
    print(f"{i+1}. 分數: {hit['_score']:.2f}")
    print(f"   問題: {hit['_source']['question']}")
    print(f"   課程: {hit['_source']['course']}")
    print(f"   章節: {hit['_source']['section']}")
    print("---")

# 取得第3個問題
if len(response_q4['hits']['hits']) >= 3:
    third_question = response_q4['hits']['hits'][2]['_source']['question']
    print(f"\n=== Q4 答案 ===")
    print(f"第3個問題: {third_question}")
else:
    print(f"\n注意: 只找到 {len(response_q4['hits']['hits'])} 個結果")

print("\n" + "="*50)
print("Q5: 建立提示")
print("="*50)

# Q5: 使用 Q4 的結果建立提示
context_template = """
Q: {question}
A: {text}
""".strip()

# 從 Q4 結果建立上下文
context_parts = []
for hit in response_q4['hits']['hits']:
    doc = hit['_source']
    context_part = context_template.format(
        question=doc['question'],
        text=doc['text']
    )
    context_parts.append(context_part)

context = "\n\n".join(context_parts)

# 建立完整提示
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

question_q5 = "How do copy a file to a Docker container?"
prompt = prompt_template.format(question=question_q5, context=context)

# 計算提示長度
prompt_length = len(prompt)

print(f"提示長度: {prompt_length} 字符")

# 顯示提示的前 500 字符作為預覽
print(f"\n提示內容預覽 (前500字符):")
print(prompt[:500] + "..." if len(prompt) > 500 else prompt)

print(f"\n=== Q5 答案 ===")
print(f"提示長度: {prompt_length}")

print("\n" + "="*50)
print("Q6: Token 計算")
print("="*50)

# Q6: 使用 tiktoken 計算提示的 token 數量
import tiktoken

# 使用 gpt-4o 的編碼器
encoding = tiktoken.encoding_for_model("gpt-4o")

# 計算 token 數量
tokens = encoding.encode(prompt)
token_count = len(tokens)

print(f"Token 數量: {token_count}")

▌Q1: 取得 Build Hash

=== Q1 答案 ===
Build Hash: dbcbbbd0bc4924cfeb28929dc05d82d662c527b7

▌Q2: 載入資料和建立索引

總共載入 948 個文檔
d:\Dev\LLM-Zoomcamp\01-homework-2.py:57: DeprecationWarning: Passing transport options in the API method is deprecated. Use ‘Elasticsearch.options()’ instead.
es_client.indices.delete(index=index_name, ignore=[400, 404])
d:\Dev\LLM-Zoomcamp\01-homework-2.py:61: DeprecationWarning: The ‘body’ parameter is deprecated and will be removed in a future version. Instead use individual parameters.
es_client.indices.create(index=index_name, body=index_settings)
索引文檔: 0%| | 0/948 [00:00<?, ?it/s]d:\Dev\LLM-Zoomcamp\01-homework-2.py:65: DeprecationWarning: The ‘body’ parameter is deprecated and will be removed in a future version. Instead use the ‘document’ parameter. See Deprecation warnings in 7.15.0 pre-releases · Issue #1698 · elastic/elasticsearch-py · GitHub for more information
es_client.index(index=index_name, body=doc)
索引文檔: 100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:45<00:00, 20.79it/s]

=== Q2 答案 ===
使用的函數: index

▌Q3: 搜尋查詢

d:\Dev\LLM-Zoomcamp\01-homework-2.py:96: DeprecationWarning: The ‘body’ parameter is deprecated and will be removed in a future version. Instead use individual parameters.
response = es_client.search(index=index_name, body=search_query)
最高分數: 44.50556

搜尋結果:

  1. 分數: 44.51
    問題: How do I debug a docker container?
    課程: machine-learning-zoomcamp

  1. 分數: 35.43
    問題: Kubernetes-dashboard
    課程: machine-learning-zoomcamp

  1. 分數: 33.71
    問題: How do I copy files from a different folder into docker container’s working directory?
    課程: machine-learning-zoomcamp

=== Q3 答案 ===
最高分數: 44.50556

▌Q4: 過濾搜尋

Q4 搜尋結果 (machine-learning-zoomcamp, 3個結果):

  1. 分數: 73.39
    問題: How do I debug a docker container?
    課程: machine-learning-zoomcamp
    章節: 5. Deploying Machine Learning Models

  1. 分數: 66.69
    問題: How do I copy files from my local machine to docker container?
    課程: machine-learning-zoomcamp
    章節: 5. Deploying Machine Learning Models

  1. 分數: 59.81
    問題: How do I copy files from a different folder into docker container’s working directory?
    課程: machine-learning-zoomcamp
    章節: 5. Deploying Machine Learning Models

=== Q4 答案 ===
第3個問題: How do I copy files from a different folder into docker container’s working directory?

▌Q5: 建立提示

提示長度: 1446 字符

提示內容預覽 (前500字符):
You’re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do copy a file to a Docker container?

CONTEXT:
Q: How do I debug a docker container?
A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash
If the container is already running, execute a command in the specific contain…

=== Q5 答案 ===
提示長度: 1446

▌Q6: Token 計算

Token 數量: 320

前 10 個 tokens: [63842, 261, 4165, 14029, 29186, 13, 30985, 290, 150339, 4122]
第一個 token (63842) 對應的文字: b"You’re"

=== Q6 答案 ===
Token 數量: 320