Joan Giner committed on
Commit
7afc738
1 Parent(s): 3b951ba
app.py CHANGED
@@ -27,8 +27,8 @@ LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,
 def extract_text_from_pdf(file_path):
     article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True, return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
     print("parsed")
-    source = article_dict.find("sourcedesc")
-    authors = source.find_all("persname")
+    #source = article_dict.find("sourcedesc")
+    #authors = source.find_all("persname")
     finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
     sections = []
     for section in article_dict['sections']:
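The two removed lines called BeautifulSoup methods (`.find` / `.find_all`) on the return value of `scipdf.parse_pdf_to_dict`, which the rest of `extract_text_from_pdf` treats as a plain dictionary, so they would raise an AttributeError; commenting them out sidesteps that. Below is a minimal sketch (not the repository's exact code) of how the function plausibly consumes that dictionary; the per-section keys `heading` and `text` are assumptions about scipdf's output, not something this diff confirms.

```python
# Sketch only: reading metadata straight from the dict returned by scipdf,
# instead of calling BeautifulSoup methods on it.
import scipdf  # assumed installed, as pinned in requirements.txt of this commit

def extract_text_from_pdf(file_path):
    article_dict = scipdf.parse_pdf_to_dict(
        file_path,
        soup=True,
        return_coordinates=False,
        grobid_url="https://kermitt2-grobid.hf.space",
    )
    finaltext = (
        article_dict["title"] + " \n\n " + article_dict["authors"]
        + " \n\n Abstract: " + article_dict["abstract"] + " \n\n "
    )
    sections = []
    for section in article_dict["sections"]:
        # Assumed keys: each section dict carries a heading and its body text.
        sections.append(section.get("heading", "") + "\n" + section.get("text", ""))
    return finaltext + "\n\n".join(sections)
```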
requirements.txt CHANGED
@@ -1,309 +1,122 @@
1
- absl-py==1.3.0
2
- accelerate==0.19.0
3
  aiofiles==23.1.0
4
  aiohttp==3.8.4
5
  aiosignal==1.3.1
6
- alembic==1.8.1
7
- altair==5.0.0
8
- anyio==3.6.2
9
- appdirs==1.4.4
10
- appnope==0.1.3
11
- asttokens==2.1.0
12
- astunparse==1.6.3
13
  async-timeout==4.0.2
 
14
  attrs==23.1.0
15
- azure-ai-formrecognizer==3.2.0
16
- azure-common==1.1.28
17
- azure-core==1.26.1
18
- backcall==0.2.0
19
- backoff==2.2.1
20
- beautifulsoup4==4.11.1
21
- bitsandbytes==0.39.0
22
  blis==0.7.9
23
- blobfile==2.0.0
24
- cachetools==5.2.0
25
- camelot-py==0.9.0
26
  catalogue==2.0.8
27
  certifi==2023.5.7
28
  cffi==1.15.1
29
- chardet==5.1.0
30
  charset-normalizer==3.1.0
31
  click==8.1.3
32
- cloudpickle==2.2.0
33
- cohere==4.1.3
34
  confection==0.0.4
35
- construct==2.5.3
36
  contourpy==1.0.7
37
- cryptography==40.0.2
38
  cycler==0.11.0
39
  cymem==2.0.7
40
- databricks-cli==0.17.3
41
  dataclasses-json==0.5.7
42
- datasets==2.7.1
43
- debugpy==1.6.3
44
- decorator==5.1.1
45
- dill==0.3.6
46
- Distance==0.1.3
47
- distlib==0.3.6
48
  distro==1.8.0
49
- docopt==0.6.2
50
- elasticsearch==7.17.7
51
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
52
- entrypoints==0.4
53
- et-xmlfile==1.1.0
54
- executing==1.2.0
55
- extruct==0.14.0
56
- faiss-cpu==1.7.2
57
  fastapi==0.95.2
58
- fastjsonschema==2.16.2
59
  ffmpy==0.3.0
60
  filelock==3.12.0
61
- Flask==2.2.2
62
- flatbuffers==22.10.26
63
  fonttools==4.39.4
64
  frozenlist==1.3.3
65
  fsspec==2023.5.0
66
- future==0.18.2
67
- gast==0.4.0
68
- gitdb==4.0.9
69
- GitPython==3.1.29
70
- google-auth==2.14.1
71
- google-auth-oauthlib==0.4.6
72
- google-pasta==0.2.0
73
- gpt-index==0.5.12
74
- gradio==3.26.0
75
- gradio_client==0.1.2
76
- grobid-client-python==0.0.5
77
- grpcio==1.50.0
78
- gunicorn==20.1.0
79
  h11==0.14.0
80
- h5py==3.7.0
81
- html-text==0.5.2
82
- html5lib==1.1
83
- htmlmin==0.1.12
84
- httpcore==0.17.1
85
  httpx==0.24.1
86
  huggingface-hub==0.14.1
87
  idna==3.4
88
- ImageHash==4.3.1
89
- importlib-metadata==5.0.0
90
- inflect==6.0.2
91
- iniconfig==1.1.1
92
- ipykernel==6.17.1
93
- ipython==8.6.0
94
- ipywidgets==8.0.2
95
- isodate==0.6.1
96
- itsdangerous==2.1.2
97
- jaconv==0.3
98
- jamo==0.4.1
99
- jarowinkler==1.2.3
100
- jedi==0.18.1
101
  Jinja2==3.1.2
102
- joblib==1.2.0
103
  jsonschema==4.17.3
104
- jstyleson==0.0.2
105
- jupyter_client==7.4.7
106
- jupyter_core==5.0.0
107
- jupyterlab-widgets==3.0.3
108
- keras==2.10.0
109
- Keras-Preprocessing==1.1.2
110
  kiwisolver==1.4.4
111
- langchain==0.0.173
112
  langcodes==3.3.0
113
- langdetect==1.0.9
114
- libclang==14.0.6
115
  linkify-it-py==2.0.2
116
- llvmlite==0.39.1
117
- loralib==0.1.1
118
- lxml==4.9.1
119
- Mako==1.2.4
120
- Markdown==3.4.1
121
  markdown-it-py==2.2.0
122
- markdown2==2.4.6
123
  MarkupSafe==2.1.2
124
  marshmallow==3.19.0
125
  marshmallow-enum==1.5.1
126
- matplotlib==3.6.1
127
- matplotlib-inline==0.1.6
128
  mdit-py-plugins==0.3.3
129
  mdurl==0.1.2
130
- mf2py==1.1.2
131
- mistune==2.0.4
132
- mmh3==3.0.0
133
- monotonic==1.6
134
- more-itertools==9.0.0
135
- mpmath==1.2.1
136
- msgpack==1.0.4
137
- msrest==0.7.1
138
  multidict==6.0.4
139
- multimethod==1.9
140
- multiprocess==0.70.14
141
  murmurhash==1.0.9
142
- mypy-extensions==0.4.3
143
- nest-asyncio==1.5.6
144
- networkx==2.8.8
145
- nltk==3.7
146
- num2words==0.5.12
147
- numba==0.56.4
148
  numexpr==2.8.4
149
- numpy==1.23.1
150
- oauthlib==3.2.2
151
- openai==0.27.2
152
  openapi-schema-pydantic==1.2.4
153
- opencv-python==4.7.0.72
154
- openpyxl==3.0.10
155
- opt-einsum==3.3.0
156
- orjson==3.8.12
157
- outcome==1.2.0
158
- packaging==21.0
159
- pandas==2.0.1
160
- pandocfilters==1.5.0
161
- parso==0.8.3
162
- pathspec==0.10.2
163
  pathy==0.10.1
164
- patsy==0.5.3
165
- pdf2image==1.16.0
166
- pdfminer.six==20221105
167
- pefile==2022.5.30
168
- pexpect==4.8.0
169
- phik==0.12.3
170
- pickleshare==0.7.5
171
  Pillow==9.5.0
172
- platformdirs==2.5.4
173
- pluggy==1.0.0
174
- posthog==2.2.0
175
  preshed==3.0.8
176
- prompt-toolkit==3.0.32
177
- protobuf==3.19.6
178
- psutil==5.9.4
179
- psycopg2-binary==2.9.5
180
- ptyprocess==0.7.0
181
- pure-eval==0.2.2
182
- py==1.11.0
183
- py-cpuinfo==9.0.0
184
- pyarrow==10.0.0
185
- pyasn1==0.4.8
186
- pyasn1-modules==0.2.8
187
  pycparser==2.21
188
- pycryptodomex==3.16.0
189
- pydantic==1.10.7
190
  pydub==0.25.1
191
- Pygments==2.13.0
192
- PyJWT==2.6.0
193
  pyparsing==3.0.9
194
- PyPDF2==2.12.1
195
  pyphen==0.14.0
196
- pypinyin==0.44.0
197
- pyRdfa3==3.5.3
198
  pyrsistent==0.19.3
199
- PySocks==1.7.1
200
- pytesseract==0.3.10
201
  python-dateutil==2.8.2
202
- python-docx==0.8.11
203
- python-dotenv==0.21.0
204
- python-magic==0.4.27
205
  python-multipart==0.0.6
206
- python-ptrace==0.9.8
207
- pytorch-wpe==0.0.1
208
- pytrec-eval==0.5
209
  pytz==2023.3
210
- PyWavelets==1.4.1
211
  PyYAML==6.0
212
- pyzmq==24.0.1
213
- quantulum3==0.7.11
214
- querystring-parser==1.2.4
215
- rank-bm25==0.2.2
216
- rapidfuzz==2.7.0
217
- rdflib==6.2.0
218
- regex==2022.10.31
219
- requests==2.28.0
220
- requests-oauthlib==1.3.1
221
- responses==0.18.0
222
- rsa==4.9
223
- safetensors==0.3.1
224
- scikit-learn==1.1.3
225
- scipdf @ git+https://github.com/titipata/scipdf_parser@501cf2547320adca78a94e2b2ba83526e46bd82e
226
- scipy==1.9.3
227
- seaborn==0.12.1
228
  semantic-version==2.10.0
229
- sentence-transformers==2.2.2
230
- sentencepiece==0.1.97
231
- seqeval==1.2.2
232
- shap==0.41.0
233
  six==1.16.0
234
- slicer==0.0.7
235
  smart-open==6.3.0
236
- smmap==5.0.0
237
  sniffio==1.3.0
238
- sortedcontainers==2.4.0
239
- soupsieve==2.3.2.post1
240
  spacy==3.5.3
241
  spacy-legacy==3.0.12
242
  spacy-loggers==1.0.4
243
- SQLAlchemy==1.4.44
244
- SQLAlchemy-Utils==0.38.3
245
- sqlparse==0.4.3
246
  srsly==2.4.6
247
- stack-data==0.6.1
248
  starlette==0.27.0
249
- statsmodels==0.13.5
250
- sympy==1.11.1
251
  tabula-py==2.7.0
252
- tabulate==0.9.0
253
- tangled-up-in-unicode==0.2.0
254
  tenacity==8.2.2
255
- tensorboard==2.10.1
256
- tensorboard-data-server==0.6.1
257
- tensorboard-plugin-wit==1.8.1
258
- termcolor==2.1.0
259
  textstat==0.7.3
260
  thinc==8.1.10
261
- threadpoolctl==3.1.0
262
- tika==1.24
263
  tiktoken==0.4.0
264
- tinycss2==1.2.1
265
- tk==0.1.0
266
- tokenize-rt==5.0.0
267
- tokenizers==0.12.1
268
- toml==0.10.2
269
- tomli==2.0.1
270
- tomli_w==1.0.0
271
- tomlkit==0.11.6
272
  toolz==0.12.0
273
- torch==1.12.1
274
- torch-complex==0.4.3
275
- torchvision==0.14.0
276
- tornado==6.2
277
- tqdm==4.64.0
278
- traitlets==5.5.0
279
- transformers @ git+https://github.com/huggingface/transformers@main
280
- typeguard==2.13.3
281
  typer==0.7.0
282
- typing-inspect==0.8.0
283
- typing_extensions==4.5.0
284
- tzdata==2023.3
285
  uc-micro-py==1.0.2
286
- ujson==5.1.0
287
- Unidecode==1.3.6
288
- url-normalize==1.4.3
289
- urllib3~=1.25
290
  uvicorn==0.22.0
291
- validators==0.18.2
292
- virtualenv==20.16.7
293
- visions==0.7.5
294
- w3lib==2.1.1
295
  wasabi==1.1.1
296
- watchdog==2.1.9
297
- wcwidth==0.2.5
298
- webencodings==0.5.1
299
- websocket-client==1.4.2
300
  websockets==11.0.3
301
- Werkzeug==2.2.2
302
- widgetsnbextension==4.0.3
303
- wrapt==1.14.1
304
- xlwt==1.3.0
305
- xmltodict==0.13.0
306
- xxhash==3.1.0
307
- yapf==0.32.0
308
  yarl==1.9.2
309
- zipp==3.10.0
 
 
 
1
  aiofiles==23.1.0
2
  aiohttp==3.8.4
3
  aiosignal==1.3.1
4
+ altair==5.0.1
5
+ anyio==3.7.0
 
 
 
 
 
6
  async-timeout==4.0.2
7
+ asyncio==3.4.3
8
  attrs==23.1.0
9
+ beautifulsoup4==4.12.2
10
  blis==0.7.9
 
 
 
11
  catalogue==2.0.8
12
  certifi==2023.5.7
13
  cffi==1.15.1
 
14
  charset-normalizer==3.1.0
15
  click==8.1.3
 
 
16
  confection==0.0.4
 
17
  contourpy==1.0.7
18
+ cryptography==41.0.0
19
  cycler==0.11.0
20
  cymem==2.0.7
 
21
  dataclasses-json==0.5.7
22
  distro==1.8.0
 
 
23
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
24
+ exceptiongroup==1.1.1
25
+ faiss-cpu==1.7.4
 
 
 
26
  fastapi==0.95.2
 
27
  ffmpy==0.3.0
28
  filelock==3.12.0
 
 
29
  fonttools==4.39.4
30
  frozenlist==1.3.3
31
  fsspec==2023.5.0
32
+ gradio==3.32.0
33
+ gradio_client==0.2.5
34
  h11==0.14.0
35
+ httpcore==0.17.2
 
 
 
 
36
  httpx==0.24.1
37
  huggingface-hub==0.14.1
38
  idna==3.4
39
  Jinja2==3.1.2
 
40
  jsonschema==4.17.3
41
  kiwisolver==1.4.4
42
+ langchain==0.0.186
43
  langcodes==3.3.0
 
 
44
  linkify-it-py==2.0.2
45
+ lxml==4.9.2
 
 
 
 
46
  markdown-it-py==2.2.0
 
47
  MarkupSafe==2.1.2
48
  marshmallow==3.19.0
49
  marshmallow-enum==1.5.1
50
+ matplotlib==3.7.1
 
51
  mdit-py-plugins==0.3.3
52
  mdurl==0.1.2
53
+ mmda==0.4.8
54
+ mpmath==1.3.0
55
  multidict==6.0.4
 
 
56
  murmurhash==1.0.9
57
+ mypy-extensions==1.0.0
58
+ ncls==0.0.66
59
+ necessary==0.4.2
60
+ networkx==3.1
 
 
61
  numexpr==2.8.4
62
+ numpy==1.24.3
63
+ openai==0.27.7
 
64
  openapi-schema-pydantic==1.2.4
65
+ orjson==3.8.14
66
+ packaging==23.1
67
+ pandas==1.5.3
68
  pathy==0.10.1
69
+ pdf2image==1.16.3
70
+ pdfminer.six==20220524
71
+ pdfplumber==0.7.4
 
 
 
 
72
  Pillow==9.5.0
 
 
 
73
  preshed==3.0.8
74
  pycparser==2.21
75
+ pydantic==1.10.8
 
76
  pydub==0.25.1
77
+ Pygments==2.15.1
 
78
  pyparsing==3.0.9
 
79
  pyphen==0.14.0
 
 
80
  pyrsistent==0.19.3
 
 
81
  python-dateutil==2.8.2
82
+ python-dotenv==1.0.0
 
 
83
  python-multipart==0.0.6
 
 
 
84
  pytz==2023.3
 
85
  PyYAML==6.0
86
+ regex==2023.5.5
87
+ requests==2.31.0
88
+ requirements-parser==0.5.0
89
+ scipdf==0.1.dev0
90
  semantic-version==2.10.0
 
 
 
 
91
  six==1.16.0
 
92
  smart-open==6.3.0
 
93
  sniffio==1.3.0
94
+ soupsieve==2.4.1
 
95
  spacy==3.5.3
96
  spacy-legacy==3.0.12
97
  spacy-loggers==1.0.4
98
+ SQLAlchemy==2.0.15
 
 
99
  srsly==2.4.6
 
100
  starlette==0.27.0
101
+ sympy==1.12
 
102
  tabula-py==2.7.0
 
 
103
  tenacity==8.2.2
 
 
 
 
104
  textstat==0.7.3
105
  thinc==8.1.10
 
 
106
  tiktoken==0.4.0
107
+ tokenizers==0.13.3
108
  toolz==0.12.0
109
+ torch==2.0.1
110
+ tqdm==4.65.0
111
+ transformers==4.29.2
 
 
 
 
 
112
  typer==0.7.0
113
+ types-setuptools==67.8.0.0
114
+ typing-inspect==0.9.0
115
+ typing_extensions==4.6.2
116
  uc-micro-py==1.0.2
117
+ urllib3==2.0.2
 
 
 
118
  uvicorn==0.22.0
119
+ Wand==0.6.11
 
 
 
120
  wasabi==1.1.1
 
 
 
 
121
  websockets==11.0.3
122
  yarl==1.9.2
 
sources/Nature-Scientific-Data/A whole-body FDG-PET:CT.pdf ADDED
sources/Nature-Scientific-Data/A whole-body FDG.txt ADDED
@@ -0,0 +1,208 @@
1
+
2
+ A whole-body FDG-PET/CT Dataset with manually annotated Tumor Lesions
3
+
4
+ Sergios Gatidis, Tobias Hepp, Marcel Früh, Christian La Fougère, Konstantin Nikolaou, Christina Pfannenberg, Bernhard Schölkopf, Thomas Küstner, Clemens Cyran & Daniel Rubin
5
+
6
+
7
+ We describe a publicly available dataset of annotated Positron Emission Tomography/Computed
8
+ Tomography (PET/CT) studies. 1014 whole body Fluorodeoxyglucose (FDG)-PET/CT datasets (501
9
+ studies of patients with malignant lymphoma, melanoma and non small cell lung cancer (NSCLC) and
10
+ 513 studies without PET-positive malignant lesions (negative controls)) acquired between 2014 and
11
+ 2018 were included. All examinations were acquired on a single, state-of-the-art PET/CT scanner. The
12
+ imaging protocol consisted of a whole-body FDG-PET acquisition and a corresponding diagnostic CT
13
+ scan. All FDG-avid lesions identified as malignant based on the clinical PET/CT report were manually
14
+ segmented on PET images in a slice-per-slice (3D) manner. We provide the anonymized original DICOM
15
+ files of all studies as well as the corresponding DICOM segmentation masks. In addition, we provide
16
+ scripts for image processing and conversion to different file formats (NIfTI, mha, hdf5). Primary
17
+ diagnosis, age and sex are provided as non-imaging information. We demonstrate how this dataset
18
+ can be used for deep learning-based automated analysis of PET/CT data and provide the trained deep
19
+ learning model.
20
+
21
+ Background & Summary: Integrated Positron Emission Tomography/Computed Tomography (PET/CT) has been established as a central
22
+ diagnostic imaging modality for several mostly oncological indications over the past two decades. The unique
23
+ strength of this hybrid imaging modality lies in its capability to provide both, highly resolved anatomical information by CT as well as functional and molecular information by PET. With growing numbers of performed
24
+ examinations, the emergence of novel PET tracers and the increasing clinical demand for quantitative analysis
25
+ and reporting of PET/CT studies is becoming increasingly complex and time consuming. To overcome this
26
+ challenge, the implementation of machine learning algorithms for faster, more objective and quantitative medical image analysis has been proposed also for the analysis of PET/CT data. First methodological studies have
27
+ demonstrated the feasibility of using deep learning frameworks for the detection and segmentation of metabolically active lesions in whole body Fluorodeoxyglucose (FDG)-PET/CT of patients with lung cancer, lymphoma
28
+ and melanoma1–4. Despite these encouraging results, deep learning-based analysis of PET/CT data is still not
29
+ established in routine clinical settings. Thus, automated medical image analysis, specifically of PET/CT images
30
+ is an ongoing field of research that requires methodological advances to become clinically applicable. In contrast
31
+ to the more widely used imaging modalities CT and MRI however, only few datasets of PET/CT studies are
32
+ publicly accessible to clinical and machine learning scientists who work on automated PET/CT analysis. Even
33
+ fewer datasets contain image-level ground truth labels to be used for machine learning research5,6. This is likely
34
+ a major obstacle for innovation and clinical translation in this field. Examples of related areas, such as analysis
35
+ of dermoscopy7 or retinal images8, show that the existence of publicly available labeled datasets can serve as a
36
+ catalyst for method development and validation. The purpose of this project is thus to provide an annotated,
37
+ publicly available dataset of PET/CT images that enables technical and clinical research in the area of machine
38
+ learning-based analysis of PET/CT studies and to demonstrate a use case of deep learning-based automated
39
+ segmentation of tumor lesions. To this end, we composed a dataset of 1,014 oncologic whole-body FDG-PET/
40
+ CT examinations of patients with lymphoma, lung cancer and malignant melanoma, as well as negative controls
41
+ together with voxel-wise manual labels of metabolically active tumor lesions. The provided data can be used by
42
+ researchers of different backgrounds for the development and evaluation of machine learning methods for PET/
43
+ CT analysis as well as for clinical research regarding the included tumor entities.
44
+
45
+
46
+ Table 1. Patient characteristics across the dataset subcategories.
47
+
48
+ Methods
49
+ Data collection: Publication of anonymized data was approved by the institutional ethics committee of the
50
+ Medical Faculty of the University of Tübingen as well as the institutional data security and privacy review board.
51
+ Data from 1,014 whole-body FDG-PET/CT examinations of 900 patients acquired between 2014 and 2018 as
52
+ part of a prospective registry study9 were included in this dataset. Of these 1,014 examinations, 501 are positive
53
+ samples, meaning they contain at least one FDG-avid tumor lesion and 513 are negative samples, meaning they
54
+ do not contain FDG-avid tumor lesions. Negative samples stem from patients who were examined by PET/CT
55
+ with a clinical indication (e.g. follow-up after tumor resection) but did not show any findings of metabolically
56
+ active malignant disease. The selection criteria for positive samples were: age >18 years, histologically confirmed
57
+ diagnosis of lung cancer, lymphoma or malignant melanoma, and presence of at least one FDG-avid tumor lesion
58
+ according to the final clinical report. The selection criteria for negative samples were: age >18 years, no detectable
59
+ FDG-avid tumor lesion according to the clinical radiology report. Of the 501 positive studies, 168 were acquired
60
+ in patients with lung cancer, 145 in patients with lymphoma and 188 in patients with melanoma. Patient characteristics are summarized in Table 1.
61
+
62
+ PET/CT Acquisition: All PET/CT examinations were performed at the University Hospital Tübingen
63
+ according to a standardized acquisition protocol on a single clinical scanner (Siemens Biograph mCT, Siemens
64
+ Healthineers, Knoxville, USA) following international guidelines for oncologic PET/CT examinations (Boellaard
65
+ et al. FDG PET/CT: EANM procedure guidelines for tumour imaging: version 2.0)10.
66
+ Diagnostic whole-body CT was acquired in expiration with arms elevated according to a standardized protocol using the following scan parameters: reference tube current exposure time product, 200 mAs with automated
67
+ exposure control (CareDose); tube voltage, 120 kV. CT examinations were performed with weight-adapted
68
+ 90–120 ml intravenous CT contrast agent in a portal-venous phase (Ultravist 370, Bayer Healthcare) or without
69
+ contrast agent (in case of existing contraindications). CT data were reconstructed in transverse orientation with
70
+ a slice thickness between 2.0 mm and 3.0 mm with an in-plane voxel edge length between 0.7 and 1.0 mm.
71
+ 18F-FDG was injected intravenously after at least 6 hours of fasting. PET acquisition was initiated 60 minutes after injection of a weight-adapted dose of approximately 300 MBq 18F-FDG (mean: 314.7 MBq, SD:
72
+ 22.1 MBq, range: [150, 432] MBq). For the purpose of weight adaptation, target FDG injection activities
73
+ were 250–300 MBq/300–350 MBq/350–400 MBq for patients with a body weight below 60 kg/between 60 and
74
+ 100 kg/above 100 kg respectively. PET was acquired over four to eight bed positions (usually from the skull
75
+ base to the mid-thigh level) and reconstructed using a 3D-ordered subset expectation maximization algorithm
76
+ (two iterations, 21 subsets, Gaussian filter 2.0 mm, matrix size 400 × 400, slice thickness 3.0 mm, voxel size of
77
+ 2.04 × 2.04 × 3 mm3). PET acquisition time was 2 min per bed position. Example PET/CT images are displayed
78
+ in Fig. 1a.
79
+
80
+ Fig. 1 Dataset properties. (a) Coronal views of CT (left) and FDG-PET (right) image volumes without
81
+ pathologic findings. (b) Example of manual tumor segmentation (bottom image, green area) of a lung cancer
82
+ mass; top: CT, middle: FDG-PET (c) Distribution of mean SUV, MTV and TLG of studies in patients with lung
83
+ cancer (blue), lymphoma (red) and melanoma (yellow).
84
+
85
+
86
+ Data labeling and processing: All examinations were assessed by a radiologist and nuclear medicine
87
+ specialist in a clinical setting. Based on the report of this clinical assessment, all FDG-avid tumor lesions (primary tumor if present and metastases if present) were segmented by an experienced radiologist (S.G., 10 years of experience in hybrid imaging) using dedicated software (NORA image analysis platform, University of Freiburg,
88
+ Germany). In case of uncertainty regarding lesion definition, the specific PET/CT studies were reviewed in
89
+ consensus with the radiologist and nuclear medicine physician who prepared the initial clinical report. To this
90
+ end CT and corresponding PET volumes were displayed side by side or as an overlay and tumor lesions showing
91
+ elevated FDG-uptake (visually above blood-pool levels) were segmented in a slice-per-slice manner resulting in
92
+ 3D binary segmentation masks. An example slice of a segmented tumor lesion is shown in Fig. 1b. DICOM data
93
+ of PET/CT volumes and corresponding segmentation masks were anonymized upon data upload to The Cancer
94
+ Imaging Archive11 using the CTP DICOM anonymizer tool.
95
+
96
+ Data properties: Of the 1014 studies (900 unique patients) included in this dataset, one study was included
97
+ of 819 patients, two studies were included of 59 patients, 3 studies of 14 patients, 4 studies of 4 patients and 5
98
+ studies of 3 patients. The mean coverage (scan range) of the PET volumes in the longitudinal direction over all
99
+ datasets was 1050.7 mm (SD: 306.7 mm, min: 600 mm, max: 1983 mm). The three included tumor entities showed
100
+ similar distributions with respect to metabolic tumor volume (MTV), mean SUV of tumor lesions and total lesion
101
+ glycolysis (TLG) (Fig. 1c). Overall, in non-negative studies, MTV, mean SUV and TLG amounted to (mean ± SD)
102
+ 219.9 ± 342.7 ml, 5.6 ± 2.7 and 1330 ± 2296 ml, respectively. For lung cancer studies, these values were
103
+ 263.6 ± 345.1 ml, 4.4 ± 1.5 and 1234 ± 1622 ml. For lymphoma studies these values were 297.5 ± 393.1 ml, 6.3 ± 2.7
104
+ and 2042 ± 2941.4 ml. For melanoma studies these values were 121.2 ± 269.4 ml, 6.2 ± 3.1 and 867.3 ± 2113.8 ml.
105
+
106
+ Data Records: This dataset can be accessed on The Cancer Imaging Archive (TCIA) under the collection name
107
+ “FDG-PET-CT-Lesions”12.
108
+
109
+ DICOM data: Each individual PET/CT dataset consists of three image series stored in the DICOM format: a whole-body CT volume stored as a DICOM image series, a corresponding whole-body FDG-PET volume stored as a DICOM image series and a binary segmentation volume stored in the DICOM segmentation object format. The entire DICOM dataset consists of 1,014 image studies, 3,042 image series and a total of 916,957 single
110
+ DICOM files (total size of approximately 419 GB). The directory structure of the DICOM dataset is depicted in
111
+ Fig. 2a. Patients are identified uniquely by their anonymized patient ID.
112
+
113
+ Conversion to other image formats: To facilitate data usage, we provide Python scripts that allow conversion of DICOM data to other medical image formats (NIfTI and mha) as well as the hdf5 format (https://github.com/lab-midas/TCIA_processing). In addition to file conversion, these scripts generate processed image
+ volumes: a CT volume resampled to the PET volume size and resolution as well as a PET volume with voxel values
+ converted to standardized uptake values (SUV). The data structure of the generated NIfTI files is represented in
+ Fig. 2b. Data in the other formats (mha and hdf5) are generated accordingly.
+
+ Metadata: In addition to imaging data, a metadata file in Comma-separated Values (csv) format is provided
+ containing information on study class (lung cancer, melanoma, lymphoma or negative), patient age (in years) and
+ patient sex. In addition, the DICOM header data include information about patient body weight, injected activity
+ and whether CT was contrast-enhanced (in case of non-enhanced CT, the CT series description includes the key
+ word “nativ”).
+
+ Fig. 2 Dataset structure. Patients are identified by a unique, anonymized ID and all studies of a single patient
+ are stored under the respective patient path. (a) DICOM data: Each study folder contains three subfolders with
+ DICOM files of the PET volume, the CT volume and the segmentation mask. (b) NIfTI data: Using the provided
+ conversion script, DICOM data can be converted to NIfTI files. In addition to NIfTI files of the PET volume
+ (PET.nii.gz), the CT volume (CT.nii.gz) and the segmentation mask (SEG.nii.gz), this script generates NIfTI
+ volumes of the PET image in SUV units (SUV.nii.gz) and a CT volume resampled to the PET resolution and
+ shape (CTres.nii.gz).
+
+ Fig. 3 Training and evaluation. (a) Representative loss curve on training data (blue) and validation data (red)
+ from one fold of a 5-fold cross validation. (b) Schematic visualization of the proposed evaluation metrics false
+ positive and false negative volumes (in addition to the Dice score).
137
+
138
+ Fig. 4 Quantitative evaluation of automated lesion segmentation. Top left: Correlation of automatically
139
+ predicted tumor volume with ground truth tumor volumes from manual segmentation in positive studies. Top
140
+ right: Distribution of Dice coefficients for automated versus manual tumor segmentation in positive studies.
141
+ Bottom left: Distribution of false negative volumes over all positive studies. Bottom right: Distribution of false
142
+ positive volumes over all studies.
143
+
144
+ Technical Validation: Deep Learning-based Lesion Segmentation. In order to provide a use case scenario for the provided dataset we trained and evaluated a deep learning model
145
+ for automated PET lesion segmentation. To this end, we used a standardized and publicly available deep learning
146
+ framework for medical image segmentation (nnUNet13). This framework is based on a 3D U-Net architecture
147
+ and provides an automated adaptive image processing pipeline. PET volumes converted to SUV units (SUV.
148
+ nii.gz, Fig. 2b) and corresponding re-sampled CT volumes (CTres.nii.gz, Fig. 2b) were used as model inputs.
149
+ Training with 5-fold cross validation was performed using the pre-configured model parameters with maximum
150
+ number of epochs set to 1,000 and an initial learning rate of 1e-4 in a dedicated GPU (NVIDIA A5000). Typical
151
+ loss and validation curves of a single validation step are depicted in Fig. 3a. For validation of algorithm performance, three metrics were used: Dice score, false positive volume and false negative volume (Fig. 3b). False positive volume was defined as the volume of false positive connected components in the predicted segmentation
152
+ mask that do not overlap with tumor regions in the ground truth segmentation mask. This can be e.g. areas of
153
+ physiological FDG-uptake (e.g. brain, heart, kidneys) that are erroneously classified as tumor. False negative
154
+ volume was defined as the volume of connected components in the ground truth segmentation mask (=tumor
155
+ lesions) that do not overlap with the estimated segmentation mask. These are tumor lesions that are entirely
156
+ missed by the segmentation algorithm. In case of negative examples without present tumor lesions in the ground
157
+ truth segmentation, only false positive volume was applicable as a metric.
158
+ We introduce these additional metrics (false positive and false negative volumes) due to the specific requirements of automated lesion segmentation. The specific challenge in automated segmentation of FDG-avid
159
+ lesions in PET is to avoid false-positive segmentation of anatomical structures that have physiologically high
160
+ FDG-uptake (e.g. brain, kidney, heart, etc.) while capturing all - even small - tumor lesions. The Dice score alone
161
+ does not differentiate between false positive or negative segmentation within a correctly detected lesion (e.g.
162
+ along its borders) and false positive or negative segmentations unconnected to detected lesions (i.e. false positive
163
+ segmentation of healthy tissue or entirely missed lesions).
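The two volume-based metrics defined above can be written down compactly. The following is a hedged sketch (binary NumPy masks and SciPy connected-component labelling are assumed; it is not the challenge's reference implementation):

```python
import numpy as np
from scipy import ndimage

def segmentation_metrics(pred, gt, voxel_volume_ml):
    """Dice score plus the false positive / false negative volumes described above.

    pred, gt: binary 3D arrays (1 = tumor); voxel_volume_ml: volume of one voxel in ml.
    """
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    dice = 2.0 * np.logical_and(pred, gt).sum() / max(pred.sum() + gt.sum(), 1)

    # False positive volume: predicted connected components with no overlap with the ground truth.
    pred_labels, n_pred = ndimage.label(pred)
    fp_voxels = 0
    for c in range(1, n_pred + 1):
        component = pred_labels == c
        if not np.logical_and(component, gt).any():
            fp_voxels += component.sum()

    # False negative volume: ground-truth lesions with no overlap with the prediction.
    gt_labels, n_gt = ndimage.label(gt)
    fn_voxels = 0
    for c in range(1, n_gt + 1):
        lesion = gt_labels == c
        if not np.logical_and(lesion, pred).any():
            fn_voxels += lesion.sum()

    return dice, fp_voxels * voxel_volume_ml, fn_voxels * voxel_volume_ml
```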
164
+ Overall, automated lesion segmentation using the described deep learning model showed good agreement
165
+ with manual ground truth segmentation (Fig. 4). On datasets containing lesions, a high correlation of MTVs was
166
+ observed between automated and manual segmentation (r = 0.85). The mean Dice score of automated compared
167
+ to manual lesion segmentation was 0.73 (±0.23) on positive datasets. Mean false positive/false negative volumes
168
+ were 8.1 (±81.4) ml/15.1 (±80.3) ml respectively. Quantitative algorithm performance results on validation
169
+ data (5-fold cross validation) are summarized in Fig. 4. Figure 5 provides qualitative examples for automated
170
+ segmentation results.
171
+ This presented use case scenario demonstrates how this dataset can be used for the development and validation
172
+ of algorithms for analysis of PET/CT data. We observed overall high automated segmentation performance that
173
+ is comparable to previous studies focusing on specific disease entities4,14. In combination with methodological
174
+ advances in the fields of machine learning and computer vision, this dataset can thus contribute to the development of increasingly accurate, robust and clinically useful algorithms for PET/CT analysis. Beyond automated
175
+ lesion segmentation, this dataset bears the potential to be used for further tasks such as automated organ segmentation or automated lesion tracking. This would require further annotations which can be integrated with relatively
176
+ low additional effort. For example, the recently published MOOSE framework15 for automated organ segmentation on PET/CT data can be directly applied to this dataset providing e.g. information about lesions localization.
177
+
178
+ Fig. 5 Examples of automated lesion segmentation. (a) Example showing excellent agreement between
179
+ manual (green) and automated (blue) tumor segmentation in a patient with lymphoma. Black arrows point
180
+ to physiological FDG-uptake in the brain, heart, bowel and urinary bladder (from top to bottom) that was
181
+ correctly not segmented. (b) Example of false positive segmentation of physiological structures with elevated
182
+ FDG-uptake. Top: False positive partial segmentation of the left kidney. Bottom: False positive partial
183
+ segmentation of back muscles.
184
+
185
+ Usage Notes: For the purpose of visualization, image data can be loaded using freely available medical image data viewers such
186
+ as the Medical Imaging Interaction Toolkit (https://www.mitk.org/) or 3D Slicer (https://www.slicer.org/). For
187
+ the purpose of computational data analysis e.g. in Python, 3D image volumes can be read using freely available
188
+ software such as pydicom (https://pydicom.github.io/) or nibabel (https://nipy.org/packages/nibabel/index.html).
189
+ The data presented in this manuscript is part of the MICCAI autoPET challenge 2022 (https://autopet.grand-challenge.org/).
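For example, after running the provided conversion script, the NIfTI volumes of one study could be loaded with nibabel roughly as follows (file names as in Fig. 2b; paths are placeholders):

```python
import nibabel as nib

suv = nib.load("SUV.nii.gz").get_fdata()     # PET converted to SUV units
ct  = nib.load("CTres.nii.gz").get_fdata()   # CT resampled to the PET grid
seg = nib.load("SEG.nii.gz").get_fdata()     # binary tumor segmentation mask

# All three arrays share the same shape, so they can be compared voxel-wise.
print(suv.shape, ct.shape, seg.shape)
```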
191
+
192
+ Code availability: We provide the code of the data conversion and processing pipeline under https://github.com/lab-midas/TCIA_processing. The trained PET/CT lesion segmentation model is publicly available under https://github.com/lab-midas/autoPET/tree/master/.
194
+ Received: 28 June 2022; Accepted: 23 September 2022;
195
+ Published: xx xx xxxx
196
+
197
+ Acknowledgements
198
+ This project was partly supported by intramural grants of Stanford University and the University of Tübingen.
199
+ This project was conducted under Germany’s Excellence Strategy–EXC-Number 2064/1–390727645 and EXC
200
+ 2180/1-390900677.
201
+
202
+ Author contributions
203
+ S.G., D.R., T.K., T.H.: conception and design of the work, acquisition, analysis and interpretation of data, creation
204
+ of new software used in the work, drafting of the manuscript. K.N., C.L.F., M.F., C.P., B.S., C.C.: discussion and
205
+ interpretation of results, substantial revision of the manuscript. All authors reviewed the manuscript.
+ Table 1: This table summarizes the patient characteristics across the dataset subcategories: for each diagnosis (Melanoma, Lymphoma, Lung Cancer, Negative, and All) and patient sex, it lists the number of studies and the age (mean and standard deviation). For Melanoma, there were 111 male studies with an average age of 65.7 ± 13.7 and 77 female studies with an average age of 65.0 ± 12.8. For Lymphoma, there were 76 male studies with an average age of 47.3 ± 17.9 and 69 female studies with an average age of 45.1 ± 19.7. For Lung Cancer, there were 103 male studies with an average age of 67.0 ± 9.0 and 65 female studies with an average age of 64.2 ± 8.7. For Negative, there were 280 male studies with an average age of 58.7 ± 15.1 and 233 female studies with an average age of 59.1 ± 14.7. For All, there were 570 male studies with an average age of 60.1 ± 15.9.
sources/Nature-Scientific-Data/Lontar-Manuscripts.pdf ADDED
sources/Nature-Scientific-Data/Lontar-Manuscripts.txt ADDED
@@ -0,0 +1,146 @@
1
+
2
+ DeepLontar dataset for handwritten Balinese character detection and syllable recognition on Lontar manuscript
3
+
4
+ Daniel Siahaan, Ni Putu Sutramiani, Nanik Suciati, I Nengah Duija, I Wayan Agus Surya Darma
5
+
6
+
7
+ The digitalization of traditional Palmyra manuscripts, such as Lontar, is the government’s main focus
8
+ in efforts to preserve Balinese culture. Digitization is done by acquiring Lontar manuscripts through
9
+ photos or scans. To understand Lontar’s contents, experts usually carry out transliteration. Automatic
10
+ transliteration using computer vision is generally carried out in several stages: character detection,
11
+ character recognition, syllable recognition, and word recognition. Many methods can be used for
12
+ detection and recognition, but they need data to train and evaluate the resulting model. In compiling
13
+ the dataset, the data needs to be processed and labelled. This paper presented data collection and
14
+ building datasets for detection and recognition tasks. Lontar was collected from libraries at universities
15
+ in Bali. Data generation was carried out to produce 400 augmented images from 200 Lontar original
16
+ images to increase the variousness of data. Annotations were performed to label each character
17
+ producing over 100,000 characters in 55 character classes. This dataset can be used to train and evaluate
18
+ performance in character detection and syllable recognition of new manuscripts.
19
+
20
+ Background & Summary: Ancient manuscript digitization is a necessary process to support the preservation of cultural heritage to avoid
21
+ document destruction. The digitization process is carried out through the acquisition of ancient manuscript
22
+ documents into digital images. Then, digital images can be further processed through the computer vision
23
+ method to extract the information in the ancient manuscript document. Balinese Lontar manuscript is a historical document used by ancient people in Bali to store important information related to ancient science, such as
24
+ traditional medicine, farming techniques, determining auspicious days, and others.
25
+ In the ancient Balinese community, traditions, instructions, and drugs ingredients were documented by
26
+ officials or scholars as Lontar manuscripts in Balinese characters. The writing process on the Balinese Lontar
27
+ manuscript uses a special knife called a pengrupak on dried palm leaves. Then, roasted candlenut powder is
28
+ used to give colour to the written Balinese characters. Balinese writers used Lontar to record a variety of important information in ancient times. The Balinese characters used have unique writing characteristics:
+ characters are written without spaces, characters are combined to form syllables, and characters can be dense, overlapping, and stuck together. The DeepLontar dataset can be used for syllable recognition by combining
+ individual characters according to special rules. This dataset is very challenging because it can only be read and translated by experts.
31
+ Balinese Lontar publicly available datasets are available on a very limited basis. Therefore, related research
32
+ has been carried out for assembling datasets for Balinese Lontar manuscripts. Windu et al.1 proposed AMADI_
33
+ LontarSet that consists of bi-level images as gold standard dataset, image datasets with word-level annotations
34
+ and isolated glyphs. The resulting performance is only below 50% due to the use of isolated character images,
35
+ which do not label every character in the Balinese Lontar manuscript. Other studies related to Balinese characters have been carried out, starting with Balinese character segmentation2, Balinese character recognition3,
36
+ Balinese character augmentation in increasing data variation4, and Balinese character detection based on deep
37
+ learning5. In the case of ancient Chinese documents, two main datasets were proposed. The datasets were
38
+ annotated with characters, including gold-standard character bounding boxes and its corresponding glyphs6.
39
+ Furthermore, a new augmentation method was introduced based on the fusion of general transfiguration with
40
+ local deformation and successfully enlarged the training dataset7. In the case of Indian documents, thorough
41
+ experimentations were performed on other corpus comprising in print and in-writing texts8. Other studies
42
+ proposed the IFN/ENIT dataset to surmount the dearth of Arabic datasets easily accessible for researchers9 and
43
+ a popular literature Arabic/English dataset: Everyday Arabic-English Scene Text dataset (EvArEST) for Arabic
44
+ text recognition10. Other researchers proposed Ekush dataset for Bangla handwritten text recognition11, Tamil
45
+ dataset for in-writing Tamil character recognition utilizing deep learning12,13, DIDA dataset for detection and
46
+ recognition of in-writing numbers in ancient manuscript drawings dating from the nineteenth century14.
47
+ Based on previous research, we proposed DeepLontar, a dataset for handwritten Balinese character detection
48
+ and syllable recognition on the Lontar manuscript. DeepLontar consists of 600 images of the Balinese Lontar
49
+ manuscript that have been annotated and validated by experts. This dataset was built through the process of
50
+ acquisition (200 original images), data generation (400 augmented images), data annotation, and expert validation. This dataset has been tested on the detection and recognition process of Balinese characters using the
51
+ YOLOv4 model. The original dataset was split into train and test data with distribution ratio of 60%:40%. Three
52
+ datasets were prepared. The first dataset, i.e. the original dataset, was split into 120 original images in the train
53
+ data and 80 original images in the test data. In the second dataset, 200 augmented images (produced by the
54
+ grayscale augmentation technique) were added into the train data. In the third dataset, another 200 augmented
55
+ images (produced by adaptive gaussian thresholding technique) were added into the train data. In those three
56
+ dataset, the YOLOv4 model produces a detection performance with mean average precision (mAP) of up to
57
+ 99.55% with precision, recall, and F1-score are 99%, 100%, and 99%, respectively5. DeepLontar consists of 55
58
+ Balinese character classes. These classes are used in writing Balinese script in Lontar Manuscripts. The entire
59
+ vocabulary in the DeepLontar dataset uses these 55-character classes. DeepLontar have been annotated and
60
+ validated by experts.
61
+ Each annotated character class has a high variation because it is written using a pengrupak, and the characters
62
+ are handwritten. The high character variation makes this dataset very challenging for detecting and recognizing
63
+ syllables in Balinese Lontar manuscripts. Figure 1 shows a sample image of a Balinese Lontar manuscript and
+ the Balinese character classes that have been annotated in the Balinese Lontar manuscript.
65
+ The lontar manuscripts are written using Balinese characters. The writing uses a special knife called pengrupak by scraping dry palm leaves so that Balinese characters are engraved on the manuscript. The coloring
66
+ process uses roasted candlenut powder, making the engraved characters black.
67
+ Figure 2 shows the acquisition process of Balinese Lontar manuscripts. It is carried out using a scanner.
68
+ This process is carried out on 200 pieces of Balinese Lontar manuscript. To enrich the variety and increase the
69
+ amount of data, we apply data generation using augmentation techniques. Based on the data generation process,
70
+ we produced 400 augmented images of the Balinese Lontar manuscript. Figure 3 shows variations of the Balinese
71
+ Lontar manuscript image in the DeepLontar dataset.
72
+ Figure 4 shows an annotated image of the Balinese Lontar manuscript. The annotation process uses LabelImg
73
+ by labeling each Balinese character. Then, it aims to label the Balinese character class and position in the Balinese
74
+ Lontar manuscript. We have tested the DeepLontar dataset using a deep learning architecture for detecting and
75
+ recognizing Balinese characters in the Balinese Lontar manuscript shown in Fig. 5. In general, each character has
76
+ been successfully detected, and its class recognized accurately with a confidence level of 99%. Figure 6 shows examples
+ of Balinese character detection and recognition results in the DeepLontar dataset.
78
+
79
+
80
+
81
+ Methods: The process of compiling the dataset was carried out in four stages. Each stage was shown in Fig. 5, starting with
82
+ data acquisition, data generation, data annotation, and validation. The first stage was data acquisition by scanning the Lontar manuscript using a scanner. Figure 3 shows the scan process per sheet of Lontar manuscripts.
83
+ The Lontar manuscript was scanned in a horizontal position according to the characteristics of the elongated
84
+ Lontar. This process produced 200 Lontar images. Furthermore, the second stage was to perform data generation
85
+ with two augmentation techniques. The augmentation technique used grayscale and adaptive gaussian thresholding for increasing the variety of data. The grayscale augmentation is used in order for the model to put lesser
86
+ importance on colour as a signal. The adaptive gaussian thresholding is utilized to sharpen the character image.
87
+ This process produced 400 augmented images, which have been enhanced. Overall, the number of initial images
88
+ and the augmented images of the Lontar manuscript was 600 images. Table 1 shows the complete character set of
89
+ Balinese character classes in DeepLontar dataset. It also shows the average precisions of character detection
90
+ model trained on original dataset (ori) and trained on augmented dataset (aug). It indicates that the augmentation technique does improve the average precision (AP).
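The two augmentation techniques named above map onto standard OpenCV calls; a minimal sketch is given below (the blockSize and C parameters of the adaptive threshold are illustrative guesses, not values reported by the authors):

```python
import cv2

def augment_lontar(image_path):
    """Produce the two augmented views used for DeepLontar: grayscale and adaptive Gaussian thresholding."""
    bgr = cv2.imread(image_path)                  # original colour scan
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)  # grayscale augmentation
    # Adaptive Gaussian thresholding sharpens the engraved characters against the leaf background.
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return gray, thresh
```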
91
+ Although the DeepLontar dataset does contain out-of-vocabulary classes, it suffers from a class imbalance problem. The
+ da madu class rarely appears in the dataset. As we can see in Table 1, the augmentation technique helps improve
+ the average precision of the detection model.
94
+
95
+ The third stage was character annotation using the LabelImg application. The Balinese character originally
96
+ consists of 75 character classes, but not all character classes are used in writing lontar manuscript. Therefore, to
97
+ determine the number of character classes, we have involved experts in determining the character classes that
98
+ are often used in writing Lontar manuscript. Image annotation was done to label the image, which was used as
99
+ ground truth. The bounding box was used to annotate each character. This process was carried out by a team and
100
+ accompanied by experts. Character annotations produced 102,966 characters came from 55 character classes.
101
+ The annotation results stored the spatial location of each character object within the observed image. The character class is annotated with the bounding box, its spatial location, and its two-dimensional size. Balinese character annotation in the Lontar manuscript produced a new Balinese character dataset for identifying Balinese
102
+ glyphs called DeepLontar. The last stage was data validation. Based on the result of our experimentation, the
103
+ dataset was able to produce up to 99.55% performance.
104
+
105
+
106
+ Data Records: DeepLontar dataset is freely accessible to the researchers at Figshare15. DeepLontar consisted of 600 images of
107
+ Balinese Lontar manuscripts and additionally, 600 *.txt files that stored information related to data annotations
108
+ in YOLO format. Balinese character annotations in DeepLontar consisted of more than 100,000 characters that
109
+ experts had validated. All files are named in the following format:
110
+ • JPEG images: < filename > .jpg, for instance: 1a.jpg, and
111
+ • TXT annotations: < filename > .txt, for instance: 1a.txt,
112
+ Annotation files format follows the YOLO format, as follow:
113
+ • <ID> <x> <y> <width> <height>, for instance: 54 0.068000 0.083333 0.016000 0.073333
114
+ where <ID> is the object class ID, <x> is x coordinate, <y> is y coordinate, <width> is width of the bounding box, and <height> is heigh of the bounding box. Table 1 shows 55 Balinese character classes.
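A minimal sketch for reading one of these label files is shown below; it assumes the standard YOLO convention that <x> and <y> are the normalised centre of the bounding box and that <width>/<height> are normalised to the image size:

```python
def load_yolo_labels(txt_path, image_width, image_height):
    """Parse a DeepLontar annotation file into (class_id, x_min, y_min, width, height) pixel boxes."""
    boxes = []
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            class_id, x_c, y_c, w, h = line.split()
            w_px, h_px = float(w) * image_width, float(h) * image_height
            x_min = float(x_c) * image_width - w_px / 2    # convert centre to top-left corner
            y_min = float(y_c) * image_height - h_px / 2
            boxes.append((int(class_id), x_min, y_min, w_px, h_px))
    return boxes
```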
115
+
116
+ Technical Validation: Data validation was carried out in two ways: validation from experts and testing using one of the deep learning
117
+ methods, namely YOLO. Validation by experts was carried out when making ground truth of Balinese characters in Lontar manuscripts. The second validation was a trial with detecting and recognizing Balinese characters
118
+ using YOLO.
119
+
120
+ Usage Notes: DeepLontar dataset images are published and bundled into one compressed file (.zip) named DeepLontar.zip.
121
+ The annotation files are published and bundled into one compressed file (.zip) named DeepLontar_labels.zip.
122
+
123
+ Code availability: The image data are available at the Figshare repository15, and the data augmentation code is based on the OpenCV
+ library. The data annotation tool, LabelImg, is available online16.
125
+ Received: 7 July 2022; Accepted: 17 November 2022;
126
+
127
+ Acknowledgements: The study is supported by the Directorate General of Higher Education, Ministry of Education and Culture
128
+ Republic of Indonesia under grant number 1564/PKS/ITS/2022.
129
+
130
+ Author contributions: D.S. and N.S.: supervision, funding acquisition, project administration, review and editing.
131
+
132
+ Open Access: This article is licensed under a Creative Commons Attribution 4.0 International
133
+ License, which permits use, sharing, adaptation, distribution and reproduction in any medium or
134
+ format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative
135
+ Commons license, and indicate if changes were made. The images or other third party material in this
136
+ article are included in the article’s Creative Commons license, unless indicated otherwise in a credit line to the
137
+ material. If material is not included in the article’s Creative Commons license and your intended use is not permitted
138
+ by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the
139
+ copyright holder. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.
140
+
+ Table 1: This table lists the 55 Balinese character classes in the DeepLontar dataset together with the frequency (Freq) of each class and the average precision, AP (%), of the character detection model trained on the original dataset (ori) and on the augmented dataset (aug).