Spaces:

soojeongcrystal
/

hybridRAG

Sleeping

File size: 11,327 Bytes

1ec6c27
 
3db0045
 
1ff04a5
402b304
2f93eb1
 
1ec6c27
9b227e3
 
 
0352e69
afea222
2f93eb1
0352e69
160f223
2f93eb1
 
 
 
 
 
 
 
402b304
afea222
65f9910
 
 
 
 
1fe42c9
65f9910
 
 
 
 
afea222
65f9910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a733b2
0352e69
 
ca15903
 
0352e69
 
 
 
702c25c
 
 
 
 
 
 
0352e69
 
 
 
1fe42c9
 
 
 
 
 
 
 
 
 
 
 
 
 
0352e69
 
 
 
1fe42c9
 
0352e69
 
 
 
 
558d69b
0352e69
afea222
2f93eb1
afea222
0352e69
1ec6c27
 
 
65f9910
 
558d69b
65f9910
 
 
3db0045
 
 
 
 
0352e69
 
 
 
 
 
1ec6c27
0352e69
3db0045
1ec6c27
0352e69
3db0045
6ec37d8
65f9910
0352e69
1fe42c9
 
 
 
 
1ec6c27
2f93eb1
 
 
 
1ec6c27
65f9910
1fe42c9
0352e69
 
1ec6c27
65f9910
0352e69
 
 
1ff04a5
0352e69
1ec6c27
afea222
7b0fd85
2f93eb1
 
402b304
9b227e3
0352e69
1ec6c27
2f93eb1
ca15903
afea222
 
 
 
 
 
 
 
0352e69
afea222
 
 
9d9c526
 
2f93eb1
 
9d9c526
3db0045
2e6bb20
0352e69
5ad04e8
1ec6c27
65f9910
0352e69
5ad04e8
 
0352e69
 
 
 
 
 
 
 
 
 
5ad04e8
9b227e3
7b0fd85
777fd1a
1ff04a5
558d69b
afea222
 
 
 
558d69b
7a733b2
777fd1a
 
558d69b
777fd1a
402b304
7b0fd85
9d9c526
7b0fd85
afea222
 
 
 
2f93eb1
 
 
 
 
 
 
 
 
777fd1a
1ec6c27

import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import csv
import io
import tempfile
import os

# Load the KoSentence-BERT model used for Korean text processing.
# It is instantiated once at import time and reused by every encode() call below.
model = SentenceTransformer('jhgan/ko-sbert-sts')

# Module-level state shared between the Gradio callbacks
# Rows produced by the most recent hybrid_rag() run; read by chat_response()
global_recommendations = None
# Path of the CSV written by the most recent hybrid_rag() run; read by download_csv() and cleanup()
global_csv_file = None
# Column mapping guessed by select_youtube_columns() for the uploaded YouTube CSV
youtube_columns = None

# CSV file creation helper
def create_csv_file(recommendations):
    """Write recommendation rows to a temporary CSV file and return its path.

    Args:
        recommendations: Iterable of rows. Each row is a sequence whose values
            line up with the header written below (employee id, employee name,
            recommended programs, recommended YouTube content).

    Returns:
        The filesystem path of the CSV file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # newline='' hands line-ending control to the csv module. Without it,
    # platforms that translate "\n" on write turn the writer's "\r\n" into
    # "\r\r\n", which spreadsheet tools display as blank rows.
    with tempfile.NamedTemporaryFile(
        mode='w+', delete=False, suffix='.csv', encoding='utf-8', newline=''
    ) as temp_file:
        writer = csv.writer(temp_file)
        writer.writerow(["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"])
        writer.writerows(recommendations)
    return temp_file.name

# Column matching helper
def auto_match_columns(df, required_cols):
    """Guess which column of ``df`` corresponds to each required field name.

    Matching is case-insensitive. A column whose label equals the required
    name (ignoring surrounding whitespace) is preferred; otherwise the first
    column whose label contains the required name as a substring is used.

    Args:
        df: Object exposing a ``columns`` iterable of column labels.
        required_cols: Field names to look for.

    Returns:
        Dict mapping every entry of ``required_cols`` to the matched column
        label, or to ``None`` when no column matches.
    """
    # Column labels are not guaranteed to be strings (e.g. numeric headers),
    # so normalise through str() instead of calling .lower() on the label.
    labels = [(col, str(col).lower()) for col in df.columns]

    matched_cols = {}
    for req_col in required_cols:
        needle = req_col.lower()
        matched_col = None
        # Pass 1: exact match, so "title" picks "Title" even if "Subtitle" comes first.
        for col, lowered in labels:
            if lowered.strip() == needle:
                matched_col = col
                break
        else:
            # Pass 2: fall back to the first label containing the required name.
            for col, lowered in labels:
                if needle in lowered:
                    matched_col = col
                    break
        matched_cols[req_col] = matched_col
    return matched_cols

# Column validation helper
def validate_and_get_columns(employee_df, program_df):
    """Resolve the required employee and program columns, reporting the first gap.

    Both frames are passed through ``auto_match_columns``. Employee fields are
    checked before program fields, and only the first unmatched field is reported.

    Returns:
        A 3-tuple ``(error_message, employee_cols, program_cols)``. On success
        the message is ``None`` and both mappings are returned; on failure the
        message describes the missing field and both mappings are ``None``.
    """
    employee_cols = auto_match_columns(
        employee_df, ["employee_id", "employee_name", "current_skills"]
    )
    program_cols = auto_match_columns(
        program_df, ["program_name", "skills_acquired", "duration"]
    )

    # First employee field that has no matching column, if any.
    key = next((field for field, column in employee_cols.items() if column is None), None)
    if key is not None:
        return f"직원 데이터에서 '{key}' 열을 선택할 수 없습니다. 올바른 열을 선택하세요.", None, None

    # First program field that has no matching column, if any.
    key = next((field for field, column in program_cols.items() if column is None), None)
    if key is not None:
        return f"프로그램 데이터에서 '{key}' 열을 선택할 수 없습니다. 올바른 열을 선택하세요.", None, None

    return None, employee_cols, program_cols

# YouTube column selection callback
def select_youtube_columns(youtube_file):
    """Build the four column dropdowns for the uploaded YouTube CSV.

    When no file is given, four dropdowns with no choices and an empty-string
    value are returned. Otherwise the CSV is read, the guessed mapping is
    stored in the module-level ``youtube_columns``, and each dropdown offers
    every CSV column with the guessed column preselected.

    Returns:
        A list of four ``gr.Dropdown`` objects in the order
        title, description, url, upload_date.
    """
    global youtube_columns
    if youtube_file is None:
        return [gr.Dropdown(choices=[], value="") for _ in range(4)]

    fields = ["title", "description", "url", "upload_date"]
    uploaded = pd.read_csv(youtube_file.name)
    youtube_columns = auto_match_columns(uploaded, fields)

    available = uploaded.columns.tolist()
    return [
        gr.Dropdown(choices=available, value=youtube_columns.get(field, ""))
        for field in fields
    ]

# YouTube content loading and normalisation helper
def load_youtube_content(file_path, title_col, description_col, url_col, upload_date_col):
    """Read the YouTube CSV and return only the selected columns under canonical names.

    Falsy selections (``None`` or ``""``) are skipped. The remaining columns
    are renamed to ``title``, ``description``, ``url`` and ``upload_date``
    according to the argument they were passed in. If an ``upload_date``
    column is present it is parsed with ``pd.to_datetime`` using
    ``errors='coerce'``, so unparseable values become ``NaT``.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    chosen = [title_col, description_col, url_col, upload_date_col]
    canonical = ['title', 'description', 'url', 'upload_date']

    frame = pd.read_csv(file_path)
    kept = [name for name in chosen if name]
    # Rename keys that do not exist in the frame are ignored by pandas.
    frame = frame[kept].rename(columns=dict(zip(chosen, canonical)))

    if 'upload_date' in frame.columns:
        frame['upload_date'] = pd.to_datetime(frame['upload_date'], errors='coerce')

    return frame

# YouTube content / training program matching helper
def match_youtube_content(program_skills, youtube_df, model):
    """Compute program-to-video cosine similarities from text embeddings.

    Args:
        program_skills: Sequence of skill descriptions, one per program.
        youtube_df: DataFrame of videos; its ``description`` column is embedded.
        model: Object exposing ``encode(list_of_str)`` that returns embeddings.

    Returns:
        The matrix returned by ``cosine_similarity`` with one row per program
        and one column per video (in ``youtube_df`` row order), or ``None``
        when there is no ``description`` column, no videos, or no programs.
    """
    if 'description' not in youtube_df.columns:
        return None
    # Nothing to compare: cosine_similarity rejects inputs with zero samples.
    if youtube_df.empty:
        return None

    # Empty CSV cells are read as NaN (a float); the encoder needs strings.
    # Row order is kept so result columns still line up with youtube_df rows.
    descriptions = youtube_df['description'].fillna('').astype(str).tolist()
    skills = ['' if pd.isna(skill) else str(skill) for skill in program_skills]
    if not skills:
        return None

    youtube_embeddings = model.encode(descriptions)
    program_embeddings = model.encode(skills)
    return cosine_similarity(program_embeddings, youtube_embeddings)

# Analyze employee data, recommend training programs, and build the result table
def hybrid_rag(employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col):
    """Recommend training programs and YouTube videos for every employee.

    Employee skills and program skills are embedded with the module-level
    ``model``; a program is recommended to an employee when their cosine
    similarity exceeds 0.5. For each recommended program the three most
    similar videos are collected. At most five programs and three videos are
    kept per employee, in program order.

    Side effects:
        Updates ``global_recommendations`` and ``global_csv_file`` and writes
        a new temporary CSV; the CSV from the previous run is removed.

    Args:
        employee_file, program_file, youtube_file: Uploaded file objects
            exposing a ``name`` attribute that holds the file path.
        title_col, description_col, url_col, upload_date_col: Selected column
            names of the YouTube CSV (may be empty).

    Returns:
        A 3-tuple of the result ``DataFrame``, a visible ``gr.File`` pointing
        at the CSV, and a visible ``gr.Button``.

    Raises:
        gr.Error: When an upload is missing, a required column cannot be
            matched, or the employee/program data contains no rows.
    """
    global global_recommendations
    global global_csv_file

    # Fail with a readable message instead of AttributeError on `.name`.
    if employee_file is None or program_file is None or youtube_file is None:
        raise gr.Error("직원, 교육 프로그램, 유튜브 콘텐츠 데이터를 모두 업로드해주세요.")

    # Load the employee and program data
    employee_df = pd.read_csv(employee_file.name)
    program_df = pd.read_csv(program_file.name)

    error_msg, employee_cols, program_cols = validate_and_get_columns(employee_df, program_df)
    if error_msg:
        # The first output is a DataFrame component, so a plain string cannot
        # be returned there; surface the message through Gradio's error modal.
        raise gr.Error(error_msg)

    if employee_df.empty or program_df.empty:
        raise gr.Error("직원 데이터 또는 교육 프로그램 데이터에 행이 없습니다.")

    # Empty cells are read as NaN (float); the encoder needs strings.
    employee_skills = employee_df[employee_cols["current_skills"]].fillna("").astype(str).tolist()
    program_skills = program_df[program_cols["skills_acquired"]].fillna("").astype(str).tolist()
    similarities = cosine_similarity(model.encode(employee_skills), model.encode(program_skills))

    # Load the YouTube content and match it against the programs
    youtube_df = load_youtube_content(youtube_file.name, title_col, description_col, url_col, upload_date_col)
    youtube_similarities = match_youtube_content(program_skills, youtube_df, model)

    # Per-program display strings are independent of the employee, so build them once.
    program_labels = [
        f"{name} ({duration})"
        for name, duration in zip(
            program_df[program_cols['program_name']].tolist(),
            program_df[program_cols['duration']].tolist(),
        )
    ]

    can_link_videos = (
        youtube_similarities is not None
        and 'title' in youtube_df.columns
        and 'url' in youtube_df.columns
    )
    top_videos = []
    if can_link_videos:
        titles = youtube_df['title'].tolist()
        urls = youtube_df['url'].tolist()
        for video_scores in youtube_similarities:
            # Three most similar videos, best first
            best = video_scores.argsort()[-3:][::-1]
            top_videos.append([f"{titles[k]} (URL: {urls[k]})" for k in best])

    employee_ids = employee_df[employee_cols['employee_id']].tolist()
    employee_names = employee_df[employee_cols['employee_name']].tolist()

    recommendation_rows = []
    # enumerate() gives positional indices into the similarity matrix, which
    # stay correct even if the frames do not carry a default 0..n-1 index.
    for emp_pos, (emp_id, emp_name) in enumerate(zip(employee_ids, employee_names)):
        matched = [
            prog_pos
            for prog_pos in range(len(program_labels))
            if similarities[emp_pos][prog_pos] > 0.5
        ]

        # Cap the output: at most 5 programs and 3 videos per employee
        recommended_programs = [program_labels[p] for p in matched][:5]
        recommended_youtube = []
        if can_link_videos:
            recommended_youtube = [video for p in matched for video in top_videos[p]][:3]

        if recommended_programs:
            recommendation_rows.append(
                [emp_id, emp_name, ", ".join(recommended_programs), ", ".join(recommended_youtube)]
            )
        else:
            recommendation_rows.append([emp_id, emp_name, "적합한 프로그램 없음", "추천 콘텐츠 없음"])

    previous_csv = global_csv_file
    global_recommendations = recommendation_rows

    # Create the CSV file
    global_csv_file = create_csv_file(recommendation_rows)

    # Each run writes a fresh temp file; drop the previous one so repeated
    # analyses do not accumulate files (only the latest is removed at exit).
    if previous_csv and previous_csv != global_csv_file and os.path.exists(previous_csv):
        try:
            os.remove(previous_csv)
        except OSError:
            pass  # best effort: a stale temp file must not fail the analysis

    # Build the result table
    result_df = pd.DataFrame(
        recommendation_rows,
        columns=["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"],
    )

    return result_df, gr.File(value=global_csv_file, visible=True), gr.Button(value="CSV 다운로드", visible=True)
    
# Chat response function
def chat_response(message, history):
    """Answer a chat message with the recommendations of the employee it names.

    The stored rows of the last analysis are scanned in order; the first row
    whose employee name occurs (case-insensitively) inside ``message`` wins.

    Args:
        message: Text typed by the user.
        history: Previous chat turns (unused).

    Returns:
        A reply string: a prompt to run the analysis first when no results
        exist, the matched employee's recommendations, or a not-found message.
    """
    global global_recommendations
    if global_recommendations is None:
        return "먼저 '분석 시작' 버튼을 눌러 데이터를 분석해주세요."

    query = (message or "").lower()
    for employee in global_recommendations:
        raw_name = employee[1]
        # Names come straight from a CSV column and may be missing (None/NaN)
        # or numeric; NaN is the only value that is not equal to itself.
        if raw_name is None or raw_name != raw_name:
            continue
        name = str(raw_name).strip().lower()
        # A blank name would match every message because "" is in any string.
        if not name:
            continue
        if name in query:
            return f"{employee[1]}님에게 추천된 프로그램은 다음과 같습니다: {employee[2]}\n\n추천 유튜브 콘텐츠: {employee[3]}"

    return "죄송합니다. 해당 직원의 정보를 찾을 수 없습니다. 다른 직원 이름을 입력해주세요."

# CSV download callback
def download_csv():
    """Return a visible ``gr.File`` whose value is the current ``global_csv_file``."""
    latest_csv_path = global_csv_file
    return gr.File(visible=True, value=latest_csv_path)

# Gradio Blocks layout and event wiring
with gr.Blocks(css=".gradio-button {background-color: #007bff; color: white;} .gradio-textbox {border-color: #6c757d;}") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #2c3e50;'>💼 HybridRAG 시스템 (유튜브 콘텐츠 포함)</h1>")
    
    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            # Step 1: data uploads
            gr.Markdown("<h3 style='color: #34495e;'>1. 데이터를 업로드하세요</h3>")
            employee_file = gr.File(label="직원 데이터 업로드", interactive=True)
            program_file = gr.File(label="교육 프로그램 데이터 업로드", interactive=True)
            youtube_file = gr.File(label="유튜브 콘텐츠 데이터 업로드", interactive=True)
            
            # Column pickers for the YouTube CSV
            gr.Markdown("<h4 style='color: #34495e;'>유튜브 데이터 열 선택</h4>")
            title_col = gr.Dropdown(label="제목 열")
            description_col = gr.Dropdown(label="설명 열")
            url_col = gr.Dropdown(label="URL 열")
            upload_date_col = gr.Dropdown(label="업로드 날짜 열")
            
            # Refresh the four dropdowns whenever a YouTube CSV is uploaded or cleared
            youtube_file.change(select_youtube_columns, inputs=[youtube_file], outputs=[title_col, description_col, url_col, upload_date_col])
            
            analyze_button = gr.Button("분석 시작", elem_classes="gradio-button")
            output_table = gr.DataFrame(label="분석 결과 (테이블)")
            # Hidden until hybrid_rag returns visible replacements
            csv_download = gr.File(label="추천 결과 다운로드", visible=False)
            download_button = gr.Button("CSV 다운로드", visible=False)

    # Step 2: per-employee lookup through the chat widget
    gr.Markdown("<h3 style='color: #34495e;'>2. 직원별 추천 프로그램 및 유튜브 콘텐츠 확인</h3>")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="직원 이름을 입력하세요")
    clear = gr.Button("대화 내역 지우기")

    def respond(message, history):
        """Append the user's message and the bot's reply to the chat history.

        chat_response() returns a plain string, but the Chatbot output
        expects the whole conversation as a list of [user, bot] pairs.
        """
        updated_history = list(history or [])
        reply = chat_response(message, updated_history)
        updated_history.append([message, reply])
        return updated_history

    # Clicking the analyze button updates the table and the download widgets
    analyze_button.click(
        hybrid_rag, 
        inputs=[employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col], 
        outputs=[output_table, csv_download, download_button]
    )

    # CSV download button
    download_button.click(download_csv, inputs=[], outputs=[csv_download])

    # Chat feature
    msg.submit(respond, [msg, chatbot], [chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

# Delete the temporary file when the program exits
import atexit

@atexit.register
def cleanup():
    """Remove the file at ``global_csv_file`` if that path is set and exists."""
    csv_path = global_csv_file
    if not csv_path:
        return
    if os.path.exists(csv_path):
        os.remove(csv_path)

# Launch the Gradio interface
demo.launch()