File size: 1,532 Bytes
ac05a26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac913dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac05a26
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import json
import numpy as np
import random
import uuid


def load_from_jsonl(filename, n=np.inf):
    """Load up to *n* JSON records from a JSON Lines file.

    Each non-blank line of the file is parsed with ``json.loads``. Lines
    containing only whitespace (for example a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filename: Path of the JSONL file to read.
        n: Maximum number of records to return. The default ``np.inf``
            means "no limit"; a value <= 0 yields an empty list.

    Returns:
        A list with the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if len(data) >= n:  # stop once n records have been collected
                break
            if not line.strip():
                # Blank lines carry no record; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data


def append_id(conversations_no_id):
    """Return new conversation dicts, each tagged with a fresh random id.

    Every output dict has exactly two keys: ``'conv_id'`` (the hex form of
    a newly generated UUID4) and ``'transcript'`` (the same transcript
    object as the input entry). Any other keys on the input entries are
    not carried over.

    Args:
        conversations_no_id: Iterable of dicts that each have a
            ``'transcript'`` key.

    Returns:
        A list of ``{'conv_id': ..., 'transcript': ...}`` dicts, in input
        order.
    """
    return [
        {'conv_id': uuid.uuid4().hex, 'transcript': entry['transcript']}
        for entry in conversations_no_id
    ]


def save_to_jsonl(data, filename):
    """Write *data* to *filename* in JSON Lines format.

    The file is opened in write mode (existing content is replaced) and
    each item is serialized with ``json.dumps`` onto its own line.

    Args:
        data: Iterable of JSON-serializable items.
        filename: Path of the file to write.
    """
    with open(filename, 'w') as out:
        # One serialized record per line, each terminated by a newline.
        out.writelines(json.dumps(record) + '\n' for record in data)


def get_conversation(data, min_length=0):
    """Pick a random conversation and a random position in its transcript.

    A conversation is chosen uniformly among those whose transcript has at
    least one index >= ``min_length``; then an index is drawn uniformly
    from ``min_length`` to ``len(transcript) - 1`` (inclusive).

    Args:
        data: Sequence of dicts with ``'conv_id'`` and ``'transcript'``
            keys, where ``'transcript'`` supports ``len()`` and indexing.
        min_length: Smallest index that may be drawn.

    Returns:
        A dict with ``'conv_id'`` (id of the chosen conversation),
        ``'slice_idx'`` (the drawn index) and ``'transcript'`` (the
        element of the chosen transcript at that index).

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If no transcript in ``data`` is longer than
            ``min_length``.
    """
    # Restrict the draw to conversations that actually have an index
    # >= min_length. Picking a shorter one would make random.randint fail
    # with an empty range, so the call would crash intermittently.
    eligible = [conv for conv in data if len(conv['transcript']) > min_length]
    if not eligible and len(data) > 0:
        raise ValueError(
            f'no conversation has a transcript longer than {min_length}'
        )

    # With empty data this raises IndexError, as random.choice(data) would.
    conv = random.choice(eligible)
    transcript = conv['transcript']
    slice_index = random.randint(min_length, len(transcript) - 1)
    return {
        'conv_id': conv['conv_id'],
        'slice_idx': slice_index,
        'transcript': transcript[slice_index],
    }


# def pad_transcript(transcript, max_length):
#     padding_count = max_length - len(transcript)
#     if padding_count > 0:
#         for _ in range(padding_count):
#             transcript.append({'speaker': '', 'response': ''})
#     return transcript


def get_last_response(transcript):
    """Return the response of the last turn that has a speaker and a response.

    Turns are scanned from the end of *transcript*; the first turn whose
    ``'speaker'`` and ``'response'`` values are both truthy wins.

    Args:
        transcript: Reversible sequence of dicts with ``'speaker'`` and
            ``'response'`` keys.

    Returns:
        The ``'response'`` value of the matching turn, or ``None`` when no
        turn qualifies (including an empty transcript).
    """
    filled_responses = (
        entry['response']
        for entry in reversed(transcript)
        if entry['speaker'] and entry['response']
    )
    return next(filled_responses, None)