File size: 2,237 Bytes
519c766
 
 
30d3f5d
519c766
30d3f5d
ec8fdf0
519c766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30d3f5d
519c766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re

import gradio as gr
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mamiksik/CodeBERTa-commit-message-autocomplete")
model = AutoModelForMaskedLM.from_pretrained("mamiksik/CodeBERTa-commit-message-autocomplete")
pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)


def parse_files(patch) -> str:
    accumulator = []
    lines = patch.splitlines()

    filename_before = None
    for line in lines:
        if line.startswith("index") or line.startswith("diff"):
            continue
        if line.startswith("---"):
            filename_before = line.split(" ", 1)[1][1:]
            continue

        if line.startswith("+++"):
            filename_after = line.split(" ", 1)[1][1:]

            if filename_before == filename_after:
                accumulator.append(f"<ide><path>{filename_before}")
            else:
                accumulator.append(f"<add><path>{filename_after}")
                accumulator.append(f"<del><path>{filename_before}")
            continue

        line = re.sub("@@[^@@]*@@", "", line)
        if len(line) == 0:
            continue

        if line[0] == "+":
            line = line.replace("+", "<add>", 1)
        elif line[0] == "-":
            line = line.replace("-", "<del>", 1)
        else:
            line = f"<ide>{line}"

        accumulator.append(line)

    return '\n'.join(accumulator)


def predict(patch, commit_message):
    input_text = parse_files(patch) + "</sep></sep><msg> " + commit_message + "</sep>"
    token_count = tokenizer(input_text, return_tensors="pt").input_ids.shape[1]
    result = pipe.predict(input_text)

    return token_count, input_text, {pred['token_str']: round(pred['score'], 3) for pred in result}


iface = gr.Interface(fn=predict, inputs=[
    gr.Textbox(label="Patch (as generated by git diff)"),
    gr.Textbox(label="Commit message (with one <mask> token)"),
], outputs=[
    gr.Textbox(label="Token count"),
    gr.Textbox(label="Parsed patch"),
    gr.Label(label="Predictions")
], examples=[["""
def main():
-    name = "John"
    print("Hello World")
""", "Remove <mask> variable"]
             ])

if __name__ == "__main__":
    iface.launch()