"""Helpers for loading, saving, and sampling conversation records (JSON Lines)."""

import json
import random
import uuid

import numpy as np


def load_from_jsonl(filename, n=np.inf):
    """Load up to ``n`` JSON records from a JSON Lines file.

    Args:
        filename: Path of the file to read; one JSON document per line.
        n: Maximum number of records to return. Defaults to no limit.

    Returns:
        A list with the decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if len(data) >= n:  # stop once n records have been collected
                break
            # Blank lines (commonly a trailing newline at EOF) are not
            # records; json.loads('') would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data


def append_id(conversations_no_id):
    """Return new conversation dicts with a freshly generated ``conv_id``.

    Only the ``'transcript'`` key of each input conversation is carried over;
    the input objects are not modified.

    Args:
        conversations_no_id: Iterable of dicts that each have a
            ``'transcript'`` key.

    Returns:
        A list of ``{'conv_id': <32-char hex>, 'transcript': ...}`` dicts.
    """
    return [
        {
            'conv_id': uuid.uuid4().hex,
            'transcript': conversation['transcript'],
        }
        for conversation in conversations_no_id
    ]


def save_to_jsonl(data, filename):
    """Write each item of ``data`` as one JSON line, overwriting ``filename``.

    Args:
        data: Iterable of JSON-serialisable items.
        filename: Path of the file to (over)write.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        for item in data:
            file.write(json.dumps(item) + '\n')


def get_conversation(data, min_length=0):
    """Pick a random conversation and a random index into its transcript.

    The index is drawn uniformly from ``[min_length, len(transcript) - 1]``.
    Conversations whose transcript is too short to contain index
    ``min_length`` are never selected.

    Args:
        data: Sequence of dicts with ``'conv_id'`` and ``'transcript'`` keys.
        min_length: Smallest index that may be drawn.

    Returns:
        A dict with ``'conv_id'``, ``'slice_idx'`` (the drawn index) and
        ``'transcript'`` (the transcript element at that index).

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If no transcript is longer than ``min_length``.
    """
    conv = random.choice(data)
    if len(conv['transcript']) <= min_length:
        # The draw landed on a conversation that is too short; redraw among
        # the eligible ones only. The overall selection stays uniform over
        # eligible conversations, and the common path stays O(1).
        eligible = [c for c in data if len(c['transcript']) > min_length]
        if not eligible:
            raise ValueError(
                f'no conversation has a transcript longer than '
                f'min_length={min_length}'
            )
        conv = random.choice(eligible)

    transcript = conv['transcript']
    slice_index = random.randint(min_length, len(transcript) - 1)
    # NOTE(review): this takes the single element at slice_index, not a
    # slice of the transcript -- confirm this is what callers expect.
    conv_slice = transcript[slice_index]
    return {
        'conv_id': conv['conv_id'],
        'slice_idx': slice_index,
        'transcript': conv_slice,
    }


# def pad_transcript(transcript, max_length):
#     padding_count = max_length - len(transcript)
#     if padding_count > 0:
#         for _ in range(padding_count):
#             transcript.append({'speaker': '', 'response': ''})
#     return transcript


def get_last_response(transcript):
    """Return the response of the last turn with a speaker and a response.

    Turns are scanned from the end; turns whose ``'speaker'`` or
    ``'response'`` value is falsy (e.g. an empty string) are skipped.

    Args:
        transcript: Sequence of dicts with ``'speaker'`` and ``'response'``
            keys.

    Returns:
        The ``'response'`` value of the last qualifying turn, or ``None`` if
        no turn qualifies.
    """
    for turn in reversed(transcript):
        if turn['speaker'] and turn['response']:
            return turn['response']
    return None