8000 geektime-ai-course/07_clustering_and_summarize.ipynb at main · xuwenhao/geektime-ai-course · GitHub
[go: up one dir, main page]

Skip to content
{"payload":{"allShortcutsEnabled":false,"fileTree":{"":{"items":[{"name":".devcontainer","path":".devcontainer","contentType":"directory"},{"name":"data","path":"data","contentType":"directory"},{"name":".env.example","path":".env.example","contentType":"file"},{"name":".gitignore","path":".gitignore","contentType":"file"},{"name":"01_open_ai_101.ipynb","path":"01_open_ai_101.ipynb","contentType":"file"},{"name":"02_zero_shot_classification.ipynb","path":"02_zero_shot_classification.ipynb","contentType":"file"},{"name":"03_food_chatbot.py","path":"03_food_chatbot.py","contentType":"file"},{"name":"03_prompt_chatbot.ipynb","path":"03_prompt_chatbot.ipynb","contentType":"file"},{"name":"04_classification_comparison.ipynb","path":"04_classification_comparison.ipynb","contentType":"file"},{"name":"05_classification_in_ml.ipynb","path":"05_classification_in_ml.ipynb","contentType":"file"},{"name":"06_chatgpt_and_application.ipynb","path":"06_chatgpt_and_application.ipynb","contentType":"file"},{"name":"07_clustering_and_summarize.ipynb","path":"07_clustering_and_summarize.ipynb","contentType":"file"},{"name":"08_edit_and_moderation.ipynb","path":"08_edit_and_moderation.ipynb","contentType":"file"},{"name":"09_semantic_search.ipynb","path":"09_semantic_search.ipynb","contentType":"file"},{"name":"10_llama_index_to_read_a_book.ipynb","path":"10_llama_index_to_read_a_book.ipynb","contentType":"file"},{"name":"11_colab_chatglm_opensource.ipynb","path":"11_colab_chatglm_opensource.ipynb","contentType":"file"},{"name":"11_open_source_embedding.ipynb","path":"11_open_source_embedding.ipynb","contentType":"file"},{"name":"13_unit_test.ipynb","path":"13_unit_test.ipynb","contentType":"file"},{"name":"14_langchain_01.ipynb","path":"14_langchain_01.ipynb","contentType":"file"},{"name":"15_langchain_02.ipynb","path":"15_langchain_02.ipynb","contentType":"file"},{"name":"16_langchain_memory.ipynb","path":"16_langchain_memory.ipynb","contentType":"file"},{"name":"17_langchain_agent.ipynb","path":"17_langchain_agent.ipynb","contentType":"file"},{"name":"18_fine_tune_and_stream.ipynb","path":"18_fine_tune_and_stream.ipynb","contentType":"file"},{"name":"19_whisper_and_gpt.ipynb","path":"19_whisper_and_gpt.ipynb","contentType":"file"},{"name":"19_whisper_local_model.ipynb","path":"19_whisper_local_model.ipynb","contentType":"file"},{"name":"20_tts_and_azure.ipynb","path":"20_tts_and_azure.ipynb","contentType":"file"},{"name":"21_digital_person.ipynb","path":"21_digital_person.ipynb","contentType":"file"},{"name":"22_huggingface.ipynb","path":"22_huggingface.ipynb","contentType":"file"},{"name":"22_huggingface_pipeline_example.ipynb","path":"22_huggingface_pipeline_example.ipynb","contentType":"file"},{"name":"23_clip_and_image_search.ipynb","path":"23_clip_and_image_search.ipynb","contentType":"file"},{"name":"24_stable_diffusion.ipynb","path":"24_stable_diffusion.ipynb","contentType":"file"},{"name":"25_controlnet_canny.ipynb","path":"25_controlnet_canny.ipynb","contentType":"file"},{"name":"25_controlnet_openpose.ipynb","path":"25_controlnet_openpose.ipynb","contentType":"file"},{"name":"25_controlnet_scribble.ipynb","path":"25_controlnet_scribble.ipynb","contentType":"file"},{"name":"AIAssistant.bas","path":"AIAssistant.bas","contentType":"file"},{"name":"LICENSE","path":"LICENSE","contentType":"file"},{"name":"README.md","path":"README.md","contentType":"file"},{"name":"README_zh_CN.md","path":"README_zh_CN.md","contentType":"file"},{"name":"auto_unit_test.py","path":"auto_unit_test.py","contentType":"file"},{"name":"conda-env.yml","path":"conda-env.yml","contentType":"file"},{"name":"requirements-gpu.txt","path":"requirements-gpu.txt","contentType":"file"},{"name":"requirements.txt","path":"requirements.txt","contentType":"file"}],"totalCount":42}},"fileTreeProcessingTime":3.5471719999999998,"foldersToFetch":[],"incompleteFileTree":false,"repo":{"id":613680226,"defaultBranch":"main","name":"geektime-ai-course","ownerLogin":"xuwenhao","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-03-14T03:44:41.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/42074?v=4","public":true,"private":false,"isOrgOwned":false},"codeLineWrapEnabled":false,"symbolsExpanded":false,"treeExpanded":true,"refInfo":{"name":"main","listCacheKey":"v0:1707842726.0","canEdit":false,"refType":"branch","currentOid":"6825e05da76e1d435d66a7d96f02ef55ad1bfd7a"},"path":"07_clustering_and_summarize.ipynb","currentUser":null,"blob":{"rawLines":["{"," \"cells\": ["," {"," \"cell_type\": \"markdown\","," \"metadata\": {},"," \"source\": ["," \"## 基于Embedding向量进行文本聚类\\n\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 3,"," \"metadata\": {},"," \"outputs\": [],"," \"source\": ["," \"from sklearn.datasets import fetch_20newsgroups\\n\","," \"import pandas as pd\\n\","," \"\\n\","," \"def twenty_newsgroup_to_csv():\\n\","," \" newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\\n\","," \"\\n\","," \" df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T\\n\","," \" df.columns = ['text', 'target']\\n\","," \"\\n\","," \" targets = pd.DataFrame( newsgroups_train.target_names, columns=['title'])\\n\","," \"\\n\","," \" out = pd.merge(df, targets, left_on='target', right_index=True)\\n\","," \" out.to_csv('20_newsgroup.csv', index=False)\\n\","," \" \\n\","," \"twenty_newsgroup_to_csv()\\n\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 24,"," \"metadata\": {},"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"Number of rows before null filtering: 11314\\n\","," \"Number of rows before token number filtering: 11096\\n\","," \"Number of rows data used: 10640\\n\""," ]"," }"," ],"," \"source\": ["," \"from openai.embeddings_utils import get_embeddings\\n\","," \"import openai, os, tiktoken, backoff\\n\","," \"\\n\","," \"openai.api_key = os.environ.get(\\\"OPENAI_API_KEY\\\")\\n\","," \"embedding_model = \\\"text-embedding-ada-002\\\"\\n\","," \"embedding_encoding = \\\"cl100k_base\\\" # this the encoding for text-embedding-ada-002\\n\","," \"batch_size = 2000\\n\","," \"max_tokens = 1000 # the maximum for text-embedding-ada-002 is 8191\\n\","," \"\\n\","," \"df = pd.read_csv('20_newsgroup.csv')\\n\","," \"print(\\\"Number of rows before null filtering:\\\", len(df))\\n\","," \"df = df[df['text'].isnull() == False]\\n\","," \"encoding = tiktoken.get_encoding(embedding_encoding)\\n\","," \"\\n\","," \"df[\\\"n_tokens\\\"] = df.text.apply(lambda x: len(encoding.encode(x)))\\n\","," \"print(\\\"Number of rows before token number filtering:\\\", len(df))\\n\","," \"df = df[df.n_tokens \u003c= max_tokens]\\n\","," \"print(\\\"Number of rows data used:\\\", len(df))\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 8,"," \"metadata\": {},"," \"outputs\": [],"," \"source\": ["," \"# 以下代码比较消耗 Token,你可以不运行\\n\","," \"# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)\\n\","," \"# def get_embeddings_with_backoff(prompts, engine):\\n\","," \"# embeddings = []\\n\","," \"# for i in range(0, len(prompts), batch_size):\\n\","," \"# batch = prompts[i:i+batch_size]\\n\","," \"# embeddings += get_embeddings(list_of_text=batch, engine=engine)\\n\","," \"# return embeddings\\n\","," \"\\n\","," \"# prompts = df.text.tolist()\\n\","," \"# prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]\\n\","," \"\\n\","," \"# embeddings = []\\n\","," \"# for batch in prompt_batches:\\n\","," \"# batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)\\n\","," \"# embeddings += batch_embeddings\\n\","," \"\\n\","," \"# df[\\\"embedding\\\"] = embeddings\\n\","," \"# df.to_parquet(\\\"data/20_newsgroup_with_embedding.parquet\\\", index=False)\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 29,"," \"metadata\": {},"," \"outputs\": [],"," \"source\": ["," \"import numpy as np\\n\","," \"from sklearn.cluster import KMeans\\n\","," \"\\n\","," \"embedding_df = pd.read_parquet(\\\"data/20_newsgroup_with_embedding.parquet\\\")\\n\","," \"\\n\","," \"matrix = np.vstack(embedding_df.embedding.values)\\n\","," \"num_of_clusters = 20\\n\","," \"\\n\","," \"kmeans = KMeans(n_clusters=num_of_clusters, init=\\\"k-means++\\\", n_init=10, random_state=42)\\n\","," \"kmeans.fit(matrix)\\n\","," \"labels = kmeans.labels_\\n\","," \"embedding_df[\\\"cluster\\\"] = labels\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 30,"," \"metadata\": {},"," \"outputs\": ["," {"," \"data\": {"," \"text/html\": ["," \"\u003cdiv\u003e\\n\","," \"\u003cstyle scoped\u003e\\n\","," \" .dataframe tbody tr th:only-of-type {\\n\","," \" vertical-align: middle;\\n\","," \" }\\n\","," \"\\n\","," \" .dataframe tbody tr th {\\n\","," \" vertical-align: top;\\n\","," \" }\\n\","," \"\\n\","," \" .dataframe thead th {\\n\","," \" text-align: right;\\n\","," \" }\\n\","," \"\u003c/style\u003e\\n\","," \"\u003ctable border=\\\"1\\\" class=\\\"dataframe\\\"\u003e\\n\","," \" \u003cthead\u003e\\n\","," \" \u003ctr style=\\\"text-align: right;\\\"\u003e\\n\","," \" \u003cth\u003e\u003c/th\u003e\\n\","," \" \u003cth\u003etext\u003c/th\u003e\\n\","," \" \u003cth\u003etarget\u003c/th\u003e\\n\","," \" \u003cth\u003etitle\u003c/th\u003e\\n\","," \" \u003cth\u003en_tokens\u003c/th\u003e\\n\","," \" \u003cth\u003eembedding\u003c/th\u003e\\n\","," \" \u003cth\u003ecluster\u003c/th\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003c/thead\u003e\\n\","," \" \u003ctbody\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e0\u003c/th\u003e\\n\","," \" \u003ctd\u003eI was wondering if anyone out there could enli...\u003c/td\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e121\u003c/td\u003e\\n\","," \" \u003ctd\u003e[-0.0391300804913044, 0.013502633199095726, -0...\u003c/td\u003e\\n\","," \" \u003ctd\u003e5\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e1\u003c/th\u003e\\n\","," \" \u003ctd\u003e\\\\nIt depends on your priorities. A lot of peo...\u003c/td\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e108\u003c/td\u003e\\n\","," \" \u003ctd\u003e[-0.0011249205563217402, -0.00376517535187304,...\u003c/td\u003e\\n\","," \" \u003ctd\u003e5\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e2\u003c/th\u003e\\n\","," \" \u003ctd\u003ean excellent automatic can be found in the sub...\u003c/td\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e476\u003c/td\u003e\\n\","," \" \u003ctd\u003e[-0.018259447067975998, -0.008410007692873478,...\u003c/td\u003e\\n\","," \" \u003ctd\u003e5\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e3\u003c/th\u003e\\n\","," \" \u003ctd\u003e: Ford and his automobile. I need information...\u003c/td\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e86\u003c/td\u003e\\n\","," \" \u003ctd\u003e[-0.012589422054588795, 0.006539034191519022, ...\u003c/td\u003e\\n\","," \" \u003ctd\u003e5\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e4\u003c/th\u003e\\n\","," \" \u003ctd\u003e\\\\nYo! Watch the attributions--I didn't say tha...\u003c/td\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e130\u003c/td\u003e\\n\","," \" \u003ctd\u003e[-0.0006192282889969647, -0.011226896196603775...\u003c/td\u003e\\n\","," \" \u003ctd\u003e10\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003c/tbody\u003e\\n\","," \"\u003c/table\u003e\\n\","," \"\u003c/div\u003e\""," ],"," \"text/plain\": ["," \" text target title \\\\\\n\","," \"0 I was wondering if anyone out there could enli... 7 rec.autos \\n\","," \"1 \\\\nIt depends on your priorities. A lot of peo... 7 rec.autos \\n\","," \"2 an excellent automatic can be found in the sub... 7 rec.autos \\n\","," \"3 : Ford and his automobile. I need information... 7 rec.autos \\n\","," \"4 \\\\nYo! Watch the attributions--I didn't say tha... 7 rec.autos \\n\","," \"\\n\","," \" n_tokens embedding cluster \\n\","," \"0 121 [-0.0391300804913044, 0.013502633199095726, -0... 5 \\n\","," \"1 108 [-0.0011249205563217402, -0.00376517535187304,... 5 \\n\","," \"2 476 [-0.018259447067975998, -0.008410007692873478,... 5 \\n\","," \"3 86 [-0.012589422054588795, 0.006539034191519022, ... 5 \\n\","," \"4 130 [-0.0006192282889969647, -0.011226896196603775... 10 \""," ]"," },"," \"execution_count\": 30,"," \"metadata\": {},"," \"output_type\": \"execute_result\""," }"," ],"," \"source\": ["," \"embedding_df.head()\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 78,"," \"metadata\": {},"," \"outputs\": ["," {"," \"data\": {"," \"text/html\": ["," \"\u003cdiv\u003e\\n\","," \"\u003cstyle scoped\u003e\\n\","," \" .dataframe tbody tr th:only-of-type {\\n\","," \" vertical-align: middle;\\n\","," \" }\\n\","," \"\\n\","," \" .dataframe tbody tr th {\\n\","," \" vertical-align: top;\\n\","," \" }\\n\","," \"\\n\","," \" .dataframe thead th {\\n\","," \" text-align: right;\\n\","," \" }\\n\","," \"\u003c/style\u003e\\n\","," \"\u003ctable border=\\\"1\\\" class=\\\"dataframe\\\"\u003e\\n\","," \" \u003cthead\u003e\\n\","," \" \u003ctr style=\\\"text-align: right;\\\"\u003e\\n\","," \" \u003cth\u003e\u003c/th\u003e\\n\","," \" \u003cth\u003ecluster\u003c/th\u003e\\n\","," \" \u003cth\u003ecount\u003c/th\u003e\\n\","," \" \u003cth\u003erank1\u003c/th\u003e\\n\","," \" \u003cth\u003erank1_count\u003c/th\u003e\\n\","," \" \u003cth\u003erank2\u003c/th\u003e\\n\","," \" \u003cth\u003erank2_count\u003c/th\u003e\\n\","," \" \u003cth\u003eper_1\u003c/th\u003e\\n\","," \" \u003cth\u003eper_1_2\u003c/th\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003c/thead\u003e\\n\","," \" \u003ctbody\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e0\u003c/th\u003e\\n\","," \" \u003ctd\u003e0\u003c/td\u003e\\n\","," \" \u003ctd\u003e432\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.windows.x\u003c/td\u003e\\n\","," \" \u003ctd\u003e406\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e1.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e93.98%\u003c/td\u003e\\n\","," \" \u003ctd\u003e94.21%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e1\u003c/th\u003e\\n\","," \" \u003ctd\u003e1\u003c/td\u003e\\n\","," \" \u003ctd\u003e418\u003c/td\u003e\\n\","," \" \u003ctd\u003esci.space\u003c/td\u003e\\n\","," \" \u003ctd\u003e388\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e2.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e92.82%\u003c/td\u003e\\n\","," \" \u003ctd\u003e93.30%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e2\u003c/th\u003e\\n\","," \" \u003ctd\u003e2\u003c/td\u003e\\n\","," \" \u003ctd\u003e1035\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.ibm.pc.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e396\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e387.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e38.26%\u003c/td\u003e\\n\","," \" \u003ctd\u003e75.65%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e3\u003c/th\u003e\\n\","," \" \u003ctd\u003e3\u003c/td\u003e\\n\","," \" \u003ctd\u003e471\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.sport.hockey\u003c/td\u003e\\n\","," \" \u003ctd\u003e455\u003c/td\u003e\\n\","," \" \u003ctd\u003e0\u003c/td\u003e\\n\","," \" \u003ctd\u003e0.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.60%\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.60%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e4\u003c/th\u003e\\n\","," \" \u003ctd\u003e4\u003c/td\u003e\\n\","," \" \u003ctd\u003e716\u003c/td\u003e\\n\","," \" \u003ctd\u003etalk.politics.misc\u003c/td\u003e\\n\","," \" \u003ctd\u003e270\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e150.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e37.71%\u003c/td\u003e\\n\","," \" \u003ctd\u003e58.66%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e5\u003c/th\u003e\\n\","," \" \u003ctd\u003e5\u003c/td\u003e\\n\","," \" \u003ctd\u003e511\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.autos\u003c/td\u003e\\n\","," \" \u003ctd\u003e420\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e6.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e82.19%\u003c/td\u003e\\n\","," \" \u003ctd\u003e83.37%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e6\u003c/th\u003e\\n\","," \" \u003ctd\u003e6\u003c/td\u003e\\n\","," \" \u003ctd\u003e870\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.motorcycles\u003c/td\u003e\\n\","," \" \u003ctd\u003e100\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e73.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e11.49%\u003c/td\u003e\\n\","," \" \u003ctd\u003e19.89%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e7\u003c/th\u003e\\n\","," \" \u003ctd\u003e7\u003c/td\u003e\\n\","," \" \u003ctd\u003e570\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.os.ms-windows.misc\u003c/td\u003e\\n\","," \" \u003ctd\u003e338\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e46.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e59.30%\u003c/td\u003e\\n\","," \" \u003ctd\u003e67.37%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e8\u003c/th\u003e\\n\","," \" \u003ctd\u003e8\u003c/td\u003e\\n\","," \" \u003ctd\u003e435\u003c/td\u003e\\n\","," \" \u003ctd\u003etalk.politics.mideast\u003c/td\u003e\\n\","," \" \u003ctd\u003e372\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e23.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e85.52%\u003c/td\u003e\\n\","," \" \u003ctd\u003e90.80%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e9\u003c/th\u003e\\n\","," \" \u003ctd\u003e9\u003c/td\u003e\\n\","," \" \u003ctd\u003e84\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.os.ms-windows.misc\u003c/td\u003e\\n\","," \" \u003ctd\u003e8\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e8.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e9.52%\u003c/td\u003e\\n\","," \" \u003ctd\u003e19.05%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e10\u003c/th\u003e\\n\","," \" \u003ctd\u003e10\u003c/td\u003e\\n\","," \" \u003ctd\u003e527\u003c/td\u003e\\n\","," \" \u003ctd\u003etalk.politics.guns\u003c/td\u003e\\n\","," \" \u003ctd\u003e382\u003c/td\u003e\\n\","," \" \u003ctd\u003etalk.religion.misc\u003c/td\u003e\\n\","," \" \u003ctd\u003e35.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e72.49%\u003c/td\u003e\\n\","," \" \u003ctd\u003e79.13%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e11\u003c/th\u003e\\n\","," \" \u003ctd\u003e11\u003c/td\u003e\\n\","," \" \u003ctd\u003e554\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.graphics\u003c/td\u003e\\n\","," \" \u003ctd\u003e324\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e17.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e58.48%\u003c/td\u003e\\n\","," \" \u003ctd\u003e61.55%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e12\u003c/th\u003e\\n\","," \" \u003ctd\u003e12\u003c/td\u003e\\n\","," \" \u003ctd\u003e372\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.motorcycles\u003c/td\u003e\\n\","," \" \u003ctd\u003e355\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e1.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e95.43%\u003c/td\u003e\\n\","," \" \u003ctd\u003e95.70%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e13\u003c/th\u003e\\n\","," \" \u003ctd\u003e13\u003c/td\u003e\\n\","," \" \u003ctd\u003e840\u003c/td\u003e\\n\","," \" \u003ctd\u003esoc.religion.christian\u003c/td\u003e\\n\","," \" \u003ctd\u003e460\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e188.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e54.76%\u003c/td\u003e\\n\","," \" \u003ctd\u003e77.14%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e14\u003c/th\u003e\\n\","," \" \u003ctd\u003e14\u003c/td\u003e\\n\","," \" \u003ctd\u003e468\u003c/td\u003e\\n\","," \" \u003ctd\u003erec.sport.baseball\u003c/td\u003e\\n\","," \" \u003ctd\u003e451\u003c/td\u003e\\n\","," \" \u003ctd\u003e0\u003c/td\u003e\\n\","," \" \u003ctd\u003e0.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.37%\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.37%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e15\u003c/th\u003e\\n\","," \" \u003ctd\u003e15\u003c/td\u003e\\n\","," \" \u003ctd\u003e538\u003c/td\u003e\\n\","," \" \u003ctd\u003emisc.forsale\u003c/td\u003e\\n\","," \" \u003ctd\u003e438\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e27.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e81.41%\u003c/td\u003e\\n\","," \" \u003ctd\u003e86.43%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e16\u003c/th\u003e\\n\","," \" \u003ctd\u003e16\u003c/td\u003e\\n\","," \" \u003ctd\u003e363\u003c/td\u003e\\n\","," \" \u003ctd\u003esci.crypt\u003c/td\u003e\\n\","," \" \u003ctd\u003e349\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e1.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.14%\u003c/td\u003e\\n\","," \" \u003ctd\u003e96.42%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e17\u003c/th\u003e\\n\","," \" \u003ctd\u003e17\u003c/td\u003e\\n\","," \" \u003ctd\u003e479\u003c/td\u003e\\n\","," \" \u003ctd\u003esci.electronics\u003c/td\u003e\\n\","," \" \u003ctd\u003e336\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e28.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e70.15%\u003c/td\u003e\\n\","," \" \u003ctd\u003e75.99%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e18\u003c/th\u003e\\n\","," \" \u003ctd\u003e18\u003c/td\u003e\\n\","," \" \u003ctd\u003e425\u003c/td\u003e\\n\","," \" \u003ctd\u003esci.med\u003c/td\u003e\\n\","," \" \u003ctd\u003e402\u003c/td\u003e\\n\","," \" \u003ctd\u003ealt.atheism\u003c/td\u003e\\n\","," \" \u003ctd\u003e1.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e94.59%\u003c/td\u003e\\n\","," \" \u003ctd\u003e94.82%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003ctr\u003e\\n\","," \" \u003cth\u003e19\u003c/th\u003e\\n\","," \" \u003ctd\u003e19\u003c/td\u003e\\n\","," \" \u003ctd\u003e532\u003c/td\u003e\\n\","," \" \u003ctd\u003esci.electronics\u003c/td\u003e\\n\","," \" \u003ctd\u003e62\u003c/td\u003e\\n\","," \" \u003ctd\u003ecomp.sys.mac.hardware\u003c/td\u003e\\n\","," \" \u003ctd\u003e23.0\u003c/td\u003e\\n\","," \" \u003ctd\u003e11.65%\u003c/td\u003e\\n\","," \" \u003ctd\u003e15.98%\u003c/td\u003e\\n\","," \" \u003c/tr\u003e\\n\","," \" \u003c/tbody\u003e\\n\","," \"\u003c/table\u003e\\n\","," \"\u003c/div\u003e\""," ],"," \"text/plain\": ["," \" cluster count rank1 rank1_count \\\\\\n\","," \"0 0 432 comp.windows.x 406 \\n\","," \"1 1 418 sci.space 388 \\n\","," \"2 2 1035 comp.sys.ibm.pc.hardware 396 \\n\","," \"3 3 471 rec.sport.hockey 455 \\n\","," \"4 4 716 talk.politics.misc 270 \\n\","," \"5 5 511 rec.autos 420 \\n\","," \"6 6 870 rec.motorcycles 100 \\n\","," \"7 7 570 comp.os.ms-windows.misc 338 \\n\","," \"8 8 435 talk.politics.mideast 372 \\n\","," \"9 9 84 comp.os.ms-windows.misc 8 \\n\","," \"10 10 527 talk.politics.guns 382 \\n\","," \"11 11 554 comp.graphics 324 \\n\","," \"12 12 372 rec.motorcycles 355 \\n\","," \"13 13 840 soc.religion.christian 460 \\n\","," \"14 14 468 rec.sport.baseball 451 \\n\","," \"15 15 538 misc.forsale 438 \\n\","," \"16 16 363 sci.crypt 349 \\n\","," \"17 17 479 sci.electronics 336 \\n\","," \"18 18 425 sci.med 402 \\n\","," \"19 19 532 sci.electronics 62 \\n\","," \"\\n\","," \" rank2 rank2_count per_1 per_1_2 \\n\","," \"0 comp.sys.mac.hardware 1.0 93.98% 94.21% \\n\","," \"1 alt.atheism 2.0 92.82% 93.30% \\n\","," \"2 comp.sys.mac.hardware 387.0 38.26% 75.65% \\n\","," \"3 0 0.0 96.60% 96.60% \\n\","," \"4 alt.atheism 150.0 37.71% 58.66% \\n\","," \"5 comp.sys.mac.hardware 6.0 82.19% 83.37% \\n\","," \"6 alt.atheism 73.0 11.49% 19.89% \\n\","," \"7 comp.sys.mac.hardware 46.0 59.30% 67.37% \\n\","," \"8 alt.atheism 23.0 85.52% 90.80% \\n\","," \"9 comp.sys.mac.hardware 8.0 9.52% 19.05% \\n\","," \"10 talk.religion.misc 35.0 72.49% 79.13% \\n\","," \"11 comp.sys.mac.hardware 17.0 58.48% 61.55% \\n\","," \"12 alt.atheism 1.0 95.43% 95.70% \\n\","," \"13 alt.atheism 188.0 54.76% 77.14% \\n\","," \"14 0 0.0 96.37% 96.37% \\n\","," \"15 comp.sys.mac.hardware 27.0 81.41% 86.43% \\n\","," \"16 comp.sys.mac.hardware 1.0 96.14% 96.42% \\n\","," \"17 comp.sys.mac.hardware 28.0 70.15% 75.99% \\n\","," \"18 alt.atheism 1.0 94.59% 94.82% \\n\","," \"19 comp.sys.mac.hardware 23.0 11.65% 15.98% \""," ]"," },"," \"metadata\": {},"," \"output_type\": \"display_data\""," }"," ],"," \"source\": ["," \"\\n\","," \"# 统计每个cluster的数量\\n\","," \"new_df = embedding_df.groupby('cluster')['cluster'].count().reset_index(name='count')\\n\","," \"\\n\","," \"# 统计这个cluster里最多的分类的数量\\n\","," \"title_count = embedding_df.groupby(['cluster', 'title']).size().reset_index(name='title_count')\\n\","," \"first_titles = title_count.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))\\n\","," \"first_titles = first_titles.reset_index(drop=True)\\n\","," \"new_df = pd.merge(new_df, first_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')\\n\","," \"new_df = new_df.rename(columns={'title': 'rank1', 'title_count': 'rank1_count'})\\n\","," \"\\n\","," \"# 统计这个cluster里第二多的分类的数量\\n\","," \"second_titles = title_count[~title_count['title'].isin(first_titles['title'])]\\n\","," \"second_titles = second_titles.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))\\n\","," \"second_title 81EA s = second_titles.reset_index(drop=True)\\n\","," \"new_df = pd.merge(new_df, second_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')\\n\","," \"new_df = new_df.rename(columns={'title': 'rank2', 'title_count': 'rank2_count'})\\n\","," \"new_df.fillna(0, inplace=True)\\n\","," \"new_df['per_1'] = (new_df['rank1_count'] / new_df['count']).map(lambda x: '{:.2%}'.format(x))\\n\","," \"new_df['per_1_2'] = ((new_df['rank1_count'] + new_df['rank2_count'])/ new_df['count']).map(lambda x: '{:.2%}'.format(x))\\n\","," \"\\n\","," \"# 将缺失值替换为 0\\n\","," \"# 输出结果\\n\","," \"display(new_df)\""," ]"," },"," {"," \"cell_type\": \"markdown\","," \"metadata\": {},"," \"source\": ["," \"## 使用提示语对文本进行总结\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 94,"," \"metadata\": {},"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"Cluster 0, Rank 1: comp.windows.x, Theme: Xlib编程\\n\","," \"Cluster 1, Rank 1: sci.space, Theme: 太空技术与航空\\n\","," \"Cluster 2, Rank 1: comp.sys.ibm.pc.hardware, Theme: PC硬件与系统\\n\","," \"Cluster 3, Rank 1: rec.sport.hockey, Theme: 欧洲冰球vs北美冰球\\n\","," \"Cluster 4, Rank 1: talk.politics.misc, Theme: 社会观点与自由\\n\","," \"Cluster 5, Rank 1: rec.autos, Theme: 汽车硬件\\n\","," \"Cluster 6, Rank 1: rec.motorcycles, Theme: 数学与文化冲击\\n\","," \"Cluster 7, Rank 1: comp.os.ms-windows.misc, Theme: PC软件与硬件\\n\","," \"Cluster 8, Rank 1: talk.politics.mideast, Theme: “穆斯林大屠杀”\\n\","," \"Cluster 9, Rank 1: comp.os.ms-windows.misc, Theme: 科技产品\\\"\\\"\\\"\\n\","," \"Cluster 10, Rank 1: talk.politics.guns, Theme: 枪支管制与安全\\n\","," \"Cluster 11, Rank 1: comp.graphics, Theme: 计算机编程与硬件\\n\","," \"Cluster 12, Rank 1: rec.motorcycles, Theme: 骑行安全与技巧\\n\","," \"Cluster 13, Rank 1: soc.religion.christian, Theme: 宗教信仰与实践\\n\","," \"Cluster 14, Rank 1: rec.sport.baseball, Theme: 棒球联盟\\n\","," \"Cluster 15, Rank 1: misc.forsale, Theme: 购物优惠和出售\\n\","," \"Cluster 16, Rank 1: sci.crypt, Theme: 关于加密政策的讨论\\n\","," \"Cluster 17, Rank 1: sci.electronics, Theme: 电子设备技术\\n\","," \"Cluster 18, Rank 1: sci.med, Theme: 药物和疾病\\n\","," \"Cluster 19, Rank 1: sci.electronics, Theme: 电子邮件使用者研究\\n\""," ]"," }"," ],"," \"source\": ["," \"items_per_cluster = 10\\n\","," \"COMPLETIONS_MODEL = \\\"text-davinci-003\\\"\\n\","," \"\\n\","," \"for i in range(num_of_clusters):\\n\","," \" cluster_name = new_df[new_df.cluster == i].iloc[0].rank1\\n\","," \" print(f\\\"Cluster {i}, Rank 1: {cluster_name}, Theme:\\\", end=\\\" \\\")\\n\","," \"\\n\","," \" content = \\\"\\\\n\\\".join(\\n\","," \" embedding_df[embedding_df.cluster == i].text.sample(items_per_cluster, random_state=42).values\\n\","," \" )\\n\","," \" response = openai.Completion.create(\\n\","," \" model=COMPLETIONS_MODEL,\\n\","," \" prompt=f'''我们想要给下面的内容,分组成有意义的类别,以便我们可以对其进行总结。请根据下面这些内容的共同点,总结一个50个字以内的新闻组的名称么?比如 “PC硬件”\\\\n\\\\n内容:\\\\n\\\"\\\"\\\"\\\\n{content}\\\\n\\\"\\\"\\\"新闻组名称:''',\\n\","," \" temperature=0,\\n\","," \" max_tokens=100,\\n\","," \" top_p=1,\\n\","," \" )\\n\","," \" print(response[\\\"choices\\\"][0][\\\"text\\\"].replace(\\\"\\\\n\\\", \\\"\\\"))\\n\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 101,"," \"metadata\": {},"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"Cluster 0, Rank 1: comp.windows.x, 抽样翻译: 没有实际执行它?不知怎么回事,我的一个xterminal用户使得只要点击鼠标右键,就会自动杀死所有客户端-哦,我的:-(谢谢,Fish\\n\","," \"Cluster 1, Rank 1: sci.space, 抽样翻译: 韦恩·马森和他的团伙在阿拉巴马州发生了什么?我还听说有一个未经证实的谣言,即航空大使们已经消失了。有其他人可以证实吗?\\n\","," \"Cluster 2, Rank 1: comp.sys.ibm.pc.hardware, 抽样翻译: 我怀疑这不是一个特定于Quadra的问题。去年我不得不放弃我“古老”的Bernoulli 20(每个磁带的价格大约是90美元,使整个事情的价值超过我的整个电脑;)。Ocean Microsystems的技术支持人员建议可以使用一些第三方驱动程序来解决这个问题 - 在我的情况下,磁带无法格式化/挂载/分区用于A / UX。\\n\","," \"Cluster 3, Rank 1: rec.sport.hockey, 抽样翻译: 我相信那是4-1。罗德·布林道·阿莫尔在第三节19.59时分攻入一球。\\n\","," \"Cluster 4, Rank 1: talk.politics.misc, 抽样翻译: 为了确保每个人都清楚:“它从未有过”是指“保护”,而不是“未能保护”;即,在我的一生中,我从未见过美国政府始终保护美国公民的利益,除非是意外。\\n\","," \"Cluster 5, Rank 1: rec.autos, 抽样翻译: 噢,来吧,傻瓜,你要做的就是在你的引擎罩上割一个洞,然后把一个管子放进去,这样你就可以把机油倒进去了。你觉得那些热门车上的大空气进气装置是干什么的?它们只是为了外观,没有人知道,它们提供了进入机油填充孔的途径。\\n\","," \"Cluster 6, Rank 1: rec.motorcycles, 抽样翻译: 你真是个失败者\\n\","," \"Cluster 7, Rank 1: comp.os.ms-windows.misc, 抽样翻译: 偶尔你需要为表现良好的东西说句好话。我的东西桥3401没有任何问题。它在DOS和OS/2上运行得很好。对于OS/2,你不需要加载任何特殊的驱动程序。安装会检测到它是一个东西桥驱动器,然后就完成了。顺便说一句,它也很快!\\n\","," \"Cluster 8, Rank 1: talk.politics.mideast, 抽样翻译: Avi, 供你参考,伊斯兰教允许宗教自由——在宗教上没有强制。犹太教是否也允许宗教自由(即是否认可非犹太人)?只是好奇而已。\\n\","," \"Cluster 9, Rank 1: comp.os.ms-windows.misc, 抽样翻译: 每个人都有自己的梦想,但只有勇敢追求梦想的人才能实现它。\\n\","," \"Cluster 10, Rank 1: talk.politics.guns, 抽样翻译: 不一定,特别是如果强奸犯被认定为此。例如,如果你有意地把手指伸进一个装满了老鼠夹的地方,然后被夹住,这是谁的错?\\n\","," \"Cluster 11, Rank 1: comp.graphics, 抽样翻译: 帮帮我!!我需要代码/包/任何东西来处理3D数据,并将其转换为带有隐藏线的线框表面。我正在使用DOS机器,代码可以是ANSI C或C ++,ANSI Fortran或Basic。我使用的数据形成一个矩形网格。请将您的回复发布到网络上,以便其他人受益。我的个人观点是,这是一个普遍的兴趣问题。谢谢!!!!!\\n\","," \"Cluster 12, Rank 1: rec.motorcycles, 抽样翻译: 这是一段心理学,对于任何长期骑行者来说都是必不可少的。人们不会去想“如果我这么做会有其他人受到影响吗?”他们只会评估“如果我这么做会受到影响吗?”\\n\","," \"Cluster 13, Rank 1: soc.religion.christian, 抽样翻译: 这是一个非常薄弱的论点,因为没有独立的支持文本(关于关键事件)。至于新约最古老的现存文本的日期......如果现在只有一个关于美国内战的现存文本,你会怎么想?现在考虑一个大部分文盲的人口,每一份手稿都是手工复制的......--Hal\\n\","," \"Cluster 14, Rank 1: rec.sport.baseball, 抽样翻译: 这个赔率意味着你下注5美元赌反败者赢8美元,或者下注9美元赌胜者赢5美元。\\n\","," \"Cluster 15, Rank 1: misc.forsale, 抽样翻译: 嗯,标题就是这样......我正在寻找便宜的二手TG-16游戏,它们支持2个或更多玩家(同时)....请给我发送所有带有价格的报价。\\n\","," \"Cluster 16, Rank 1: sci.crypt, 抽样翻译: 哪里?老实说,我没有看到任何……我不同意,至少有其他标准已经存在。此外,即使他们限制NREN上的加密,谁在乎呢?大部分互联网都是商业的。NREN只适用于政府和大学研究(阅读提案-它是一条“数据高速公路”,与互联网无关)。\\n\","," \"Cluster 17, Rank 1: sci.electronics, 抽样翻译: 动态RAM不是基于翻转锁存器;基本上每个位只有一个晶体管和电容来存储!静态RAM是基于翻转锁存器,更加昂贵,密度也更低。如果忽略电子和热膨胀,两者都没有任何“移动”的部件...Chris\\n\","," \"Cluster 18, Rank 1: sci.med, 抽样翻译: 化学品已经消失,感谢所有的回应。\\n\","," \"Cluster 19, Rank 1: sci.electronics, 抽样翻译: 尝试lyman.pppl.gov--/pub/8051\\n\""," ]"," }"," ],"," \"source\": ["," \"items_per_cluster = 1\\n\","," \"COMPLETIONS_MODEL = \\\"text-davinci-003\\\"\\n\","," \"\\n\","," \"for i in range(num_of_clusters):\\n\","," \" cluster_name = new_df[new_df.cluster == i].iloc[0].rank1\\n\","," \" print(f\\\"Cluster {i}, Rank 1: {cluster_name}, 抽样翻译:\\\", end=\\\" \\\")\\n\","," \"\\n\","," \" content = \\\"\\\\n\\\".join(\\n\","," \" embedding_df[(embedding_df.cluster == i) \u0026 (embedding_df.n_tokens \u003c 100)].text.sample(items_per_cluster, random_state=42).values\\n\","," \" )\\n\","," \" response = openai.Completion.create(\\n\","," \" model=COMPLETIONS_MODEL,\\n\","," \" prompt=f'''请把下面的内容翻译成中文\\\\n\\\\n内容:\\\\n\\\"\\\"\\\"\\\\n{content}\\\\n\\\"\\\"\\\"翻译:''',\\n\","," \" temperature=0,\\n\","," \" max_tokens=500,\\n\","," \" top_p=1,\\n\","," \" )\\n\","," \" print(response[\\\"choices\\\"][0][\\\"text\\\"].replace(\\\"\\\\n\\\", \\\"\\\"))\""," ]"," },"," {"," \"cell_type\": \"markdown\","," \"metadata\": {},"," \"source\": ["," \"从输出的结果来看,我们选取的内容和AI总结的标题的确八九不离十。不知道这个过程,有没有重新让你回忆起Completion这个接口的强大之处。我们只要给出合适的指令,它就能按照我们的心意给出合理的结果。\\n\""," ]"," },"," {"," \"cell_type\": \"markdown\","," \"metadata\": {},"," \"source\": ["," \"## 利用文本摘要支持无限轮数的聊天\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 106,"," \"metadata\": {"," \"tags\": []"," },"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"\\n\","," \"User和Assistant聊了鱼香肉丝和蚝油牛肉的制作方法。User问了Assistant两个关于如何做鱼香肉丝和蚝油牛肉的问题,Assistant给出了回答并介绍了每道菜的具体制作方法,同时也提示了调料的配比和牛肉制作时要注意的细节。\\n\""," ]"," }"," ],"," \"source\": ["," \"history = \\\"\\\"\\\"User : 你是谁?\\n\","," \"Assistant : 我是一个人工智能语言模型,可以回答各种问题,包括做菜的问题。\\n\","," \"\\n\","," \"User : 请问鱼香肉丝怎么做?\\n\","," \"Assistant : 鱼香肉丝的做法:\\n\","," \"\\n\","," \"1. 猪里脊肉切丝,加入盐、淀粉、料酒搅拌均匀,腌制10分钟\\n\","," \"2. 胡萝卜、青椒、木耳切丝备用\\n\","," \"3. 热锅凉油,放入蒜末、姜末煸炒出香味,加入肉丝翻炒至变色\\n\","," \"4. 加入胡萝卜、青椒、木耳翻炒,加入酱油、糖、醋、盐、味精、水淀粉搅拌均匀,倒入锅中翻炒均匀即可。\\n\","," \"\\n\","," \"提示:鱼香肉丝的关键是调料的配比,酱油、糖、醋、盐、味精的比例要适合自己的口味。\\n\","," \"\\n\","," \"User : 那蚝油牛肉呢?\\n\","," \"Assistant : 蚝油牛肉的做法:\\n\","," \"\\n\","," \"1. 将牛肉切成薄片,加入盐、淀粉、料酒腌制10分钟\\n\","," \"2. 葱姜切末,蒜切片,青红椒切丝备用\\n\","," \"3. 热锅凉油,放入葱姜末、蒜片煸炒出香味,加入牛肉翻炒至变色\\n\","," \"4. 加入蚝油、酱油、糖、水淀粉调味,加入青红椒翻炒均匀即可。\\n\","," \"\\n\","," \"提示:蚝油牛肉的关键是牛肉要切薄,翻炒时火候要快,保证牛肉口感鲜嫩。调味时,蚝油和酱油的比例也要适合自己的口味。\\n\","," \"\\\"\\\"\\\"\\n\","," \"\\n\","," \"def summarize(text, max_tokens=200):\\n\","," \" response = openai.Completion.create(\\n\","," \" model=COMPLETIONS_MODEL,\\n\","," \" prompt=text + \\\"\\\\n\\\\n请总结一下上面User和Assistant聊了些什么:\\\\n\\\",\\n\","," \" max_tokens=max_tokens,\\n\","," \" )\\n\","," \" return response[\\\"choices\\\"][0][\\\"text\\\"]\\n\","," \"\\n\","," \"summarized = summarize(history)\\n\","," \"print(summarized)\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 107,"," \"metadata\": {"," \"tags\": []"," },"," \"outputs\": [],"," \"source\": ["," \"\\n\","," \"class Conversation:\\n\","," \" def __init__(self, prompt, num_of_round):\\n\","," \" self.prompt = prompt\\n\","," \" self.num_of_round = num_of_round\\n\","," \" self.messages = []\\n\","," \" self.messages.append({\\\"role\\\": \\\"system\\\", \\\"content\\\": self.prompt})\\n\","," \"\\n\","," \" def ask(self, question):\\n\","," \" try:\\n\","," \" self.messages.append( {\\\"role\\\": \\\"user\\\", \\\"content\\\": question})\\n\","," \" response = openai.ChatCompletion.create(\\n\","," \" model=\\\"gpt-3.5-turbo\\\",\\n\","," \" messages=self.messages,\\n\","," \" temperature=0.5,\\n\","," \" max_tokens=2048,\\n\","," \" top_p=1,\\n\","," \" )\\n\","," \" except Exception as e:\\n\","," \" print(e)\\n\","," \" return e\\n\","," \"\\n\","," \" message = response[\\\"choices\\\"][0][\\\"message\\\"][\\\"content\\\"]\\n\","," \" self.messages.append({\\\"role\\\": \\\"assistant\\\", \\\"content\\\": message})\\n\","," \" \\n\","," \" if len(self.messages) \u003e self.num_of_round*2 + 1:\\n\","," \" del self.messages[1:3]\\n\","," \" return message\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 108,"," \"metadata\": {},"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"User : 那宫保鸡丁呢?\\n\","," \"Assistant : 宫保鸡丁的制作方法也比较简单。首先,将鸡肉切成小丁状,用料酒、盐、生抽腌制一下。然后将青椒、红椒、葱姜蒜切成丁状备用。接着,将花生米炒香备用。\\n\","," \"\\n\","," \"热锅凉油,油温七成热时放入鸡丁煸炒至变色,捞出备用。再将葱姜蒜爆香,加入青红椒丁翻炒一下,然后加入鸡丁,翻炒均匀。最后加入适量的糖、盐、醋、生抽、老抽、料酒、水淀粉炒匀,最后加入炒香的花生米即可。\\n\","," \"\\n\","," \"需要注意的是,炒鸡丁的时候要用大火,这样鸡肉会更嫩。另外,调料的配比也很关键,需要根据个人口味适量调整。\\n\","," \"\\n\""," ]"," }"," ],"," \"source\": ["," \"prompt = summarized + \\\"\\\\n\\\\n请你根据已经聊了的内容,继续对话:\\\"\\n\","," \"conversation = Conversation(prompt, 5)\\n\","," \"\\n\","," \"question = \\\"那宫保鸡丁呢?\\\"\\n\","," \"answer = conversation.ask(question)\\n\","," \"print(\\\"User : %s\\\" % question)\\n\","," \"print(\\\"Assistant : %s\\\\n\\\" % answer)\""," ]"," },"," {"," \"cell_type\": \"code\","," \"execution_count\": 109,"," \"metadata\": {},"," \"outputs\": ["," {"," \"name\": \"stdout\","," \"output_type\": \"stream\","," \"text\": ["," \"User : 那宫保鸡丁呢?\\n\","," \"Assistant : 宫保鸡丁是一道非常有名的川菜,口感麻辣鲜香,非常美味。你喜欢吃辣的食物吗?\\n\","," \"\\n\""," ]"," }"," ],"," \"source\": ["," \"conversation = Conversation(\\\"请你根据已经聊了的内容,继续对话:\\\", 5)\\n\","," \"\\n\","," \"question = \\\"那宫保鸡丁呢?\\\"\\n\","," \"answer = conversation.ask(question)\\n\","," \"print(\\\"User : %s\\\" % question)\\n\","," \"print(\\\"Assistant : %s\\\\n\\\" % answer)\""," ]"," },"," {"," \"cell_type\": \"markdown\","," \"metadata\": {},"," \"source\": []"," }"," ],"," \"metadata\": {"," \"kernelspec\": {"," \"display_name\": \"Python 3 (ipykernel)\","," \"language\": \"python\","," \"name\": \"python3\""," },"," \"language_info\": {"," \"codemirror_mode\": {"," \"name\": \"ipython\","," \"version\": 3"," },"," \"file_extension\": \".py\","," \"mimetype\": \"text/x-python\","," \"name\": \"python\","," \"nbconvert_exporter\": \"python\","," \"pygments_lexer\": \"ipython3\","," \"version\": \"3.10.9\""," },"," \"vscode\": {"," \"interpreter\": {"," \"hash\": \"8114e84f04cf14e493992e1b725447accf84073d5ec18e7063d492738bf032cb\""," }"," }"," },"," \"nbformat\": 4,"," \"nbformat_minor\": 4","}"],"stylingDirectives":null,"colorizedLines":null,"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/xuwenhao/geektime-ai-course/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":null},"displayName":"07_clustering_and_summarize.ipynb","displayUrl":"https://notebooks.githubusercontent.com/view/ipynb?browser=unknown_browser\u0026bypass_fastly=true\u0026color_mode=auto\u0026commit=6825e05da76e1d435d66a7d96f02ef55ad1bfd7a\u0026device=unknown_device\u0026docs_host=https%3A%2F%2Fdocs.github.com\u0026enc_url=68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f787577656e68616f2f6765656b74696d652d61692d636f757273652f363832356530356461373665316434333564363661376439366630326566353561643162666437612f30375f636c7573746572696e675f616e645f73756d6d6172697a652e6970796e62\u0026logged_in=false\u0026nwo=xuwenhao%2Fgeektime-ai-course\u0026path=07_clustering_and_summarize.ipynb\u0026platform=unknown_platform\u0026repository_id=613680226\u0026repository_type=Repository\u0026version=0","headerInfo":{"blobSize":"36 KB","deleteTooltip":"You must be signed in to make or propose changes","editTooltip":"You must be signed in to make or propose changes","ghDesktopPath":"https://desktop.github.com","isGitLfs":false,"onBranch":true,"shortPath":"24c9792","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2Fxuwenhao%2Fgeektime-ai-course%2Fblob%2Fmain%2F07_clustering_and_summarize.ipynb","isCSV":false,"isRichtext":false,"toc":null,"lineInfo":{"truncatedLoc":"866","truncatedSloc":"866"},"mode":"file"},"image":false,"isCodeownersFile":null,"isPlain":false,"isValidLegacyIssueTemplate":false,"issueTemplate":null,"discussionTemplate":null,"language":"Jupyter Notebook","languageID":185,"large":false,"planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/xuwenhao/geektime-ai-course/blob/main/07_clustering_and_summarize.ipynb","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","releasePath":"/xuwenhao/geektime-ai-course/releases/new?marketplace=true","showPublishActionBanner":false},"rawBlobUrl":"https://github.com/xuwenhao/geektime-ai-course/raw/refs/heads/main/07_clustering_and_summarize.ipynb","renderImageOrRaw":false,"richText":null,"renderedFileInfo":{"identityUUID":"c3c739ca-9cd4-4746-80c7-81291824bba8","renderFileType":"ipynb","size":36815},"shortPath":null,"symbolsEnabled":true,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":{"timed_out":false,"not_analyzed":true,"symbols":[]}},"copilotInfo":null,"copilotAccessAllowed":false,"modelsAccessAllowed":false,"modelsRepoIntegrationEnabled":false,"csrf_tokens":{"/xuwenhao/geektime-ai-course/branches":{"post":"CscpwlUfvTM-_o7OvGW28UwNufA4IOmq0S6xT7dRk1CW4xgIiLY9WFhywaMqZzzb7Rv-jXfnRHhoPnyquAw1gg"},"/repos/preferences":{"post":"EMeknqmzzyZXmJqoFkN-8LjJq5zl9wWnYNnZgD2JH4vI4EKfKqvM_ajoUKVp1sPn2TfrkppQ53-8lK57JNNccA"}}},"title":"geektime-ai-course/07_clustering_and_summarize.ipynb at main · xuwenhao/geektime-ai-course","appPayload":{"helpUrl":"https://docs.github.com","findFileWorkerPath":"/assets-cdn/worker/find-file-worker-7d7eb7c71814.js","findInFileWorkerPath":"/assets-cdn/worker/find-in-file-worker-1ae9fa256942.js","githubDevUrl":null,"enabled_features":{"code_nav_ui_events":false,"react_blob_overlay":false,"accessible_code_button":true,"github_models_repo_integration":false}}}
0