File size: 11,327 Bytes
1ec6c27
 
3db0045
 
1ff04a5
402b304
2f93eb1
 
1ec6c27
9b227e3
 
 
0352e69
afea222
2f93eb1
0352e69
160f223
2f93eb1
 
 
 
 
 
 
 
402b304
afea222
65f9910
 
 
 
 
1fe42c9
65f9910
 
 
 
 
afea222
65f9910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a733b2
0352e69
 
ca15903
 
0352e69
 
 
 
702c25c
 
 
 
 
 
 
0352e69
 
 
 
1fe42c9
 
 
 
 
 
 
 
 
 
 
 
 
 
0352e69
 
 
 
1fe42c9
 
0352e69
 
 
 
 
558d69b
0352e69
afea222
2f93eb1
afea222
0352e69
1ec6c27
 
 
65f9910
 
558d69b
65f9910
 
 
3db0045
 
 
 
 
0352e69
 
 
 
 
 
1ec6c27
0352e69
3db0045
1ec6c27
0352e69
3db0045
6ec37d8
65f9910
0352e69
1fe42c9
 
 
 
 
1ec6c27
2f93eb1
 
 
 
1ec6c27
65f9910
1fe42c9
0352e69
 
1ec6c27
65f9910
0352e69
 
 
1ff04a5
0352e69
1ec6c27
afea222
7b0fd85
2f93eb1
 
402b304
9b227e3
0352e69
1ec6c27
2f93eb1
ca15903
afea222
 
 
 
 
 
 
 
0352e69
afea222
 
 
9d9c526
 
2f93eb1
 
9d9c526
3db0045
2e6bb20
0352e69
5ad04e8
1ec6c27
65f9910
0352e69
5ad04e8
 
0352e69
 
 
 
 
 
 
 
 
 
5ad04e8
9b227e3
7b0fd85
777fd1a
1ff04a5
558d69b
afea222
 
 
 
558d69b
7a733b2
777fd1a
 
558d69b
777fd1a
402b304
7b0fd85
9d9c526
7b0fd85
afea222
 
 
 
2f93eb1
 
 
 
 
 
 
 
 
777fd1a
1ec6c27
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import csv
import io
import tempfile
import os

# Load the KoSentence-BERT model used to embed Korean text
model = SentenceTransformer('jhgan/ko-sbert-sts')  # created once at import time and reused by every callback

# Module-level state shared between the Gradio callbacks
global_recommendations = None  # rows built by the last hybrid_rag() run; None until an analysis has run
global_csv_file = None  # path of the temporary CSV written by the last hybrid_rag() run
youtube_columns = None  # auto-matched column mapping stored by select_youtube_columns()

# CSV file creation helper
def create_csv_file(recommendations):
    """Write the recommendation rows to a temporary CSV file and return its path.

    Args:
        recommendations: Iterable of rows. Each row holds the employee ID, the
            employee name, the recommended programs and the recommended
            YouTube content, in that order.

    Returns:
        Path of the created ``.csv`` file. The file is created with
        ``delete=False``, so it stays on disk until the caller removes it.
    """
    header = ["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"]
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" on write produce "\r\r\n" row terminators.
    with tempfile.NamedTemporaryFile(
        mode='w+', delete=False, suffix='.csv', encoding='utf-8', newline=''
    ) as temp_file:
        writer = csv.writer(temp_file)
        writer.writerow(header)
        writer.writerows(recommendations)
    return temp_file.name

# Column matching helper
def auto_match_columns(df, required_cols):
    """Map each required column name to the first matching column of ``df``.

    A column matches when the required name occurs inside the column label,
    compared case-insensitively. Columns are tried in ``df.columns`` order.

    Args:
        df: Object exposing a ``columns`` iterable of string labels.
        required_cols: Iterable of required column names.

    Returns:
        Dict keyed by required name; the value is the matched column label,
        or ``None`` when no column matches.
    """
    return {
        wanted: next(
            (label for label in df.columns if wanted.lower() in label.lower()),
            None,
        )
        for wanted in required_cols
    }

# Column validation helper
def validate_and_get_columns(employee_df, program_df):
    """Auto-match the required employee and program columns and validate them.

    Args:
        employee_df: Employee data frame; needs columns matching
            ``employee_id``, ``employee_name`` and ``current_skills``.
        program_df: Program data frame; needs columns matching
            ``program_name``, ``skills_acquired`` and ``duration``.

    Returns:
        ``(error_message, None, None)`` for the first required column that
        could not be matched (employee columns are checked before program
        columns), otherwise ``(None, employee_cols, program_cols)`` where the
        two dicts map required names to actual column labels.
    """
    employee_cols = auto_match_columns(
        employee_df, ["employee_id", "employee_name", "current_skills"]
    )
    program_cols = auto_match_columns(
        program_df, ["program_name", "skills_acquired", "duration"]
    )

    # First required employee column without a match, if any.
    key = next((name for name, label in employee_cols.items() if label is None), None)
    if key is not None:
        return f"직원 λ°μ΄ν„°μ—μ„œ '{key}' 열을 선택할 수 μ—†μŠ΅λ‹ˆλ‹€. μ˜¬λ°”λ₯Έ 열을 μ„ νƒν•˜μ„Έμš”.", None, None

    # First required program column without a match, if any.
    key = next((name for name, label in program_cols.items() if label is None), None)
    if key is not None:
        return f"ν”„λ‘œκ·Έλž¨ λ°μ΄ν„°μ—μ„œ '{key}' 열을 선택할 수 μ—†μŠ΅λ‹ˆλ‹€. μ˜¬λ°”λ₯Έ 열을 μ„ νƒν•˜μ„Έμš”.", None, None

    return None, employee_cols, program_cols

# YouTube data column selection helper
def select_youtube_columns(youtube_file):
    """Build the four column-selection dropdowns for an uploaded YouTube CSV.

    Stores the auto-matched column mapping in the module-level
    ``youtube_columns`` as a side effect.

    Args:
        youtube_file: Uploaded file object exposing ``.name`` (the CSV path),
            or ``None`` when nothing is uploaded.

    Returns:
        List of four ``gr.Dropdown`` objects, in the order title,
        description, URL, upload date. Without a file each dropdown has no
        choices; otherwise the choices are the CSV's columns and the value is
        the auto-matched column for that role.
    """
    global youtube_columns
    roles = ["title", "description", "url", "upload_date"]

    if youtube_file is None:
        return [gr.Dropdown(choices=[], value="") for _ in roles]

    youtube_df = pd.read_csv(youtube_file.name)
    youtube_columns = auto_match_columns(youtube_df, roles)

    column_options = youtube_df.columns.tolist()
    return [
        gr.Dropdown(choices=column_options, value=youtube_columns.get(role, ""))
        for role in roles
    ]

# YouTube content data loading and preprocessing helper
def load_youtube_content(file_path, title_col, description_col, url_col, upload_date_col):
    """Load a YouTube CSV and return only the selected columns under canonical names.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        title_col: Source column to expose as ``title``; falsy to skip.
        description_col: Source column to expose as ``description``; falsy to skip.
        url_col: Source column to expose as ``url``; falsy to skip.
        upload_date_col: Source column to expose as ``upload_date``; falsy to skip.

    Returns:
        A new DataFrame holding one column per selected role, in the order
        title, description, url, upload_date. ``upload_date`` is parsed with
        ``pd.to_datetime(errors='coerce')``, so unparseable values become NaT.

    Raises:
        KeyError: If a selected source column does not exist in the CSV.
    """
    raw_df = pd.read_csv(file_path)

    role_sources = (
        ('title', title_col),
        ('description', description_col),
        ('url', url_col),
        ('upload_date', upload_date_col),
    )
    # Build a fresh frame keyed by role instead of slicing and renaming in
    # place: this avoids mutating a view of raw_df, keeps None/"" out of the
    # mapping, and lets one source column serve several roles without
    # producing duplicate column labels.
    youtube_df = pd.DataFrame(
        {role: raw_df[source] for role, source in role_sources if source},
        index=raw_df.index,
    )

    if 'upload_date' in youtube_df.columns:
        youtube_df['upload_date'] = pd.to_datetime(youtube_df['upload_date'], errors='coerce')

    return youtube_df

# Helper that matches YouTube content against the training programs
def match_youtube_content(program_skills, youtube_df, model):
    """Compute program-to-video cosine similarities from text embeddings.

    Args:
        program_skills: List of skill texts, one per training program.
        youtube_df: DataFrame of YouTube content; its ``description`` column
            is the text that gets embedded.
        model: Object exposing ``encode(list_of_texts)`` that returns one
            embedding per text.

    Returns:
        The ``cosine_similarity`` matrix with one row per program and one
        column per video, or ``None`` when there is nothing to compare
        (no ``description`` column, no videos, or no programs).
    """
    if 'description' not in youtube_df.columns:
        return None
    # With zero rows on either side there is no similarity matrix to build.
    if youtube_df.empty or len(program_skills) == 0:
        return None

    # Missing CSV cells are read as float NaN; hand the encoder strings only.
    descriptions = youtube_df['description'].fillna('').astype(str).tolist()
    youtube_embeddings = model.encode(descriptions)
    program_embeddings = model.encode(program_skills)
    return cosine_similarity(program_embeddings, youtube_embeddings)

# Analyze the employee data, recommend training programs, and build the result table
def hybrid_rag(employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col):
    """Recommend training programs and related YouTube videos for each employee.

    Employee skills and program skills are embedded with the module-level
    ``model``; a program is recommended to an employee when their cosine
    similarity exceeds 0.5. For every recommended program the three most
    similar YouTube videos are collected. At most five programs (in program
    file order) and three videos are kept per employee.

    Side effects: stores the result rows in ``global_recommendations`` and the
    path of a freshly written CSV in ``global_csv_file``.

    Args:
        employee_file: Uploaded employee CSV (object exposing ``.name``).
        program_file: Uploaded program CSV (object exposing ``.name``).
        youtube_file: Uploaded YouTube CSV (object exposing ``.name``), or
            ``None`` to skip video recommendations.
        title_col: YouTube column to use as the video title.
        description_col: YouTube column to use as the video description.
        url_col: YouTube column to use as the video URL.
        upload_date_col: YouTube column to use as the upload date.

    Returns:
        Tuple of the result DataFrame, a visible ``gr.File`` for the CSV and
        a visible ``gr.Button`` for the download action.

    Raises:
        gr.Error: If a required employee or program column cannot be matched.
    """
    global global_recommendations
    global global_csv_file

    # Load the employee and program data
    employee_df = pd.read_csv(employee_file.name)
    program_df = pd.read_csv(program_file.name)

    error_msg, employee_cols, program_cols = validate_and_get_columns(employee_df, program_df)
    if error_msg:
        # The first output is a DataFrame component, which cannot render a
        # bare message string; report the problem as a Gradio error instead.
        raise gr.Error(error_msg)

    id_col = employee_cols["employee_id"]
    name_col = employee_cols["employee_name"]
    program_name_col = program_cols["program_name"]
    duration_col = program_cols["duration"]

    # Missing CSV cells are read as float NaN; hand the encoder strings only.
    employee_skills = employee_df[employee_cols["current_skills"]].fillna("").astype(str).tolist()
    program_skills = program_df[program_cols["skills_acquired"]].fillna("").astype(str).tolist()
    similarities = cosine_similarity(model.encode(employee_skills), model.encode(program_skills))

    # Load the YouTube content and match it against the programs (optional)
    youtube_df = None
    youtube_similarities = None
    if youtube_file is not None:
        youtube_df = load_youtube_content(youtube_file.name, title_col, description_col, url_col, upload_date_col)
        youtube_similarities = match_youtube_content(program_skills, youtube_df, model)
    # Videos can only be listed when similarities exist and both display columns were selected.
    can_list_videos = (
        youtube_similarities is not None
        and 'title' in youtube_df.columns
        and 'url' in youtube_df.columns
    )

    recommendation_rows = []
    # enumerate() gives positional indices for the similarity matrices, so the
    # lookup does not depend on the frames' index labels.
    for emp_pos, (_, employee) in enumerate(employee_df.iterrows()):
        recommended_programs = []
        recommended_youtube = []
        for prog_pos, (_, program) in enumerate(program_df.iterrows()):
            if similarities[emp_pos][prog_pos] > 0.5:
                recommended_programs.append(f"{program[program_name_col]} ({program[duration_col]})")

                if can_list_videos:
                    top_youtube_indices = youtube_similarities[prog_pos].argsort()[-3:][::-1]  # top 3
                    for idx in top_youtube_indices:
                        video = youtube_df.iloc[idx]
                        entry = f"{video['title']} (URL: {video['url']})"
                        # The same video can rank high for several programs; list it once.
                        if entry not in recommended_youtube:
                            recommended_youtube.append(entry)

        # Cap the number of recommended programs and YouTube items
        recommended_programs = recommended_programs[:5]  # at most 5 programs
        recommended_youtube = recommended_youtube[:3]  # at most 3 YouTube items

        if recommended_programs:
            recommendation_rows.append([employee[id_col], employee[name_col],
                                        ", ".join(recommended_programs), ", ".join(recommended_youtube)])
        else:
            recommendation_rows.append([employee[id_col], employee[name_col],
                                        "μ ν•©ν•œ ν”„λ‘œκ·Έλž¨ μ—†μŒ", "μΆ”μ²œ μ½˜ν…μΈ  μ—†μŒ"])

    global_recommendations = recommendation_rows

    # Every run writes a new delete=False temp file; remove the previous one
    # so repeated analyses do not pile up files (the atexit hook only knows
    # the latest path).
    if global_csv_file and os.path.exists(global_csv_file):
        os.remove(global_csv_file)

    # Create the CSV file
    global_csv_file = create_csv_file(recommendation_rows)

    # Build the result table data frame
    result_df = pd.DataFrame(recommendation_rows, columns=["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"])

    return result_df, gr.File(value=global_csv_file, visible=True), gr.Button(value="CSV λ‹€μš΄λ‘œλ“œ", visible=True)
    
# Chat response helper
def chat_response(message, history):
    """Answer a chat message with the recommendations of the employee it names.

    Looks through ``global_recommendations`` (rows of employee ID, name,
    programs, YouTube content) and replies for the first employee whose name
    occurs in ``message``, compared case-insensitively.

    Args:
        message: Text typed by the user.
        history: Chat history supplied by the caller; not used here.

    Returns:
        The reply text: a prompt to run the analysis first when no results
        exist yet, the matched employee's recommendations, or a not-found
        message.
    """
    global global_recommendations
    if global_recommendations is None:
        return "λ¨Όμ € '뢄석 μ‹œμž‘' λ²„νŠΌμ„ 눌러 데이터λ₯Ό λΆ„μ„ν•΄μ£Όμ„Έμš”."

    query = message.lower()
    for employee in global_recommendations:
        name = employee[1]
        # Names come from CSV cells: a missing cell is a float NaN (no
        # .lower()), and a blank name would be a substring of every message.
        if not isinstance(name, str) or not name.strip():
            continue
        if name.lower() in query:
            return f"{employee[1]}λ‹˜μ—κ²Œ μΆ”μ²œλœ ν”„λ‘œκ·Έλž¨μ€ λ‹€μŒκ³Ό κ°™μŠ΅λ‹ˆλ‹€: {employee[2]}\n\nμΆ”μ²œ 유튜브 μ½˜ν…μΈ : {employee[3]}"

    return "μ£„μ†‘ν•©λ‹ˆλ‹€. ν•΄λ‹Ή μ§μ›μ˜ 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. λ‹€λ₯Έ 직원 이름을 μž…λ ₯ν•΄μ£Όμ„Έμš”."

# CSV download helper
def download_csv():
    """Return a visible ``gr.File`` pointing at the module-level ``global_csv_file`` path."""
    latest_csv = global_csv_file
    return gr.File(value=latest_csv, visible=True)

# Gradio Blocks layout
# UI definition: file uploads and YouTube column pickers, the analysis button
# with its result table and CSV download, and a chat box for per-employee lookups.
with gr.Blocks(css=".gradio-button {background-color: #007bff; color: white;} .gradio-textbox {border-color: #6c757d;}") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #2c3e50;'>πŸ’Ό HybridRAG μ‹œμŠ€ν…œ (유튜브 μ½˜ν…μΈ  포함)</h1>")

    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            gr.Markdown("<h3 style='color: #34495e;'>1. 데이터λ₯Ό μ—…λ‘œλ“œν•˜μ„Έμš”</h3>")
            employee_file = gr.File(label="직원 데이터 μ—…λ‘œλ“œ", interactive=True)
            program_file = gr.File(label="ꡐ윑 ν”„λ‘œκ·Έλž¨ 데이터 μ—…λ‘œλ“œ", interactive=True)
            youtube_file = gr.File(label="유튜브 μ½˜ν…μΈ  데이터 μ—…λ‘œλ“œ", interactive=True)

            gr.Markdown("<h4 style='color: #34495e;'>유튜브 데이터 μ—΄ 선택</h4>")
            title_col = gr.Dropdown(label="제λͺ© μ—΄")
            description_col = gr.Dropdown(label="μ„€λͺ… μ—΄")
            url_col = gr.Dropdown(label="URL μ—΄")
            upload_date_col = gr.Dropdown(label="μ—…λ‘œλ“œ λ‚ μ§œ μ—΄")

            # Refresh the four column dropdowns whenever a YouTube file is uploaded or cleared
            youtube_file.change(select_youtube_columns, inputs=[youtube_file], outputs=[title_col, description_col, url_col, upload_date_col])

            analyze_button = gr.Button("뢄석 μ‹œμž‘", elem_classes="gradio-button")
            output_table = gr.DataFrame(label="뢄석 κ²°κ³Ό (ν…Œμ΄λΈ”)")
            csv_download = gr.File(label="μΆ”μ²œ κ²°κ³Ό λ‹€μš΄λ‘œλ“œ", visible=False)
            download_button = gr.Button("CSV λ‹€μš΄λ‘œλ“œ", visible=False)

    gr.Markdown("<h3 style='color: #34495e;'>2. 직원별 μΆ”μ²œ ν”„λ‘œκ·Έλž¨ 및 유튜브 μ½˜ν…μΈ  확인</h3>")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="직원 이름을 μž…λ ₯ν•˜μ„Έμš”")
    clear = gr.Button("λŒ€ν™” λ‚΄μ—­ μ§€μš°κΈ°")

    # Clicking the analysis button updates the table and the file download
    analyze_button.click(
        hybrid_rag,
        inputs=[employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col],
        outputs=[output_table, csv_download, download_button]
    )

    # CSV download button
    download_button.click(download_csv, inputs=[], outputs=[csv_download])

    # Chat feature
    # chat_response() returns only the reply text, while the Chatbot output
    # takes the whole conversation, so append the new exchange to the history.
    # NOTE(review): uses [user, bot] pairs, the history format of a default
    # gr.Chatbot(); switch to role/content dicts if it is created with
    # type="messages".
    def _append_chat_turn(message, history):
        reply = chat_response(message, history)
        return (history or []) + [[message, reply]]

    msg.submit(_append_chat_turn, [msg, chatbot], [chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

# Delete the temporary file when the program exits
import atexit

@atexit.register
def cleanup():
    """Remove the temporary CSV named by ``global_csv_file`` if it still exists.

    Registered with ``atexit`` so it runs at interpreter shutdown.
    """
    csv_path = global_csv_file
    if not csv_path:
        return
    if os.path.exists(csv_path):
        os.remove(csv_path)

# Launch the Gradio interface
demo.launch()