lhoestq committed

Commit 019fb90
1 Parent(s): fbd2a96

Upload 9 files

Files changed (9)
  1. Dockerfile +106 -0
  2. README.md +9 -4
  3. data/hf_spark_utils.py +183 -0
  4. data/spark.ipynb +121 -0
  5. login.html +70 -0
  6. on_startup.sh +5 -0
  7. packages.txt +2 -0
  8. requirements.txt +8 -0
  9. start_server.sh +23 -0
Dockerfile ADDED
@@ -0,0 +1,106 @@
+ FROM nvidia/cuda:11.3.1-base-ubuntu20.04
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+ TZ=Europe/Paris
+
+ # Remove any third-party apt sources to avoid issues with expiring keys.
+ # Install some basic utilities
+ RUN rm -f /etc/apt/sources.list.d/*.list && \
+ apt-get update && apt-get install -y --no-install-recommends \
+ curl \
+ ca-certificates \
+ sudo \
+ git \
+ wget \
+ procps \
+ git-lfs \
+ zip \
+ unzip \
+ htop \
+ vim \
+ nano \
+ bzip2 \
+ libx11-6 \
+ build-essential \
+ libsndfile-dev \
+ software-properties-common \
+ && rm -rf /var/lib/apt/lists/*
+
+ RUN add-apt-repository ppa:flexiondotorg/nvtop && \
+ apt-get upgrade -y && \
+ apt-get install -y --no-install-recommends nvtop
+
+ RUN curl -sL https://deb.nodesource.com/setup_20.x | bash - && \
+ apt-get install -y nodejs && \
+ npm install -g configurable-http-proxy
+
+ # Create a working directory
+ WORKDIR /app
+
+ # Create a non-root user and switch to it
+ RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
+ && chown -R user:user /app
+ RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
+ USER user
+
+ # All users can use /home/user as their home directory
+ ENV HOME=/home/user
+ RUN mkdir $HOME/.cache $HOME/.config \
+ && chmod -R 777 $HOME
+
+ # Set up the Conda environment
+ ENV CONDA_AUTO_UPDATE_CONDA=false \
+ PATH=$HOME/miniconda/bin:$PATH
+ RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
+ && chmod +x ~/miniconda.sh \
+ && ~/miniconda.sh -b -p ~/miniconda \
+ && rm ~/miniconda.sh \
+ && conda clean -ya
+
+ WORKDIR $HOME/app
+
+ #######################################
+ # Start root user section
+ #######################################
+
+ USER root
+
+ # User Debian packages
+ ## Security warning: potential user code executed as root (build time)
+ RUN --mount=target=/root/packages.txt,source=packages.txt \
+ apt-get update && \
+ xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
+ && rm -rf /var/lib/apt/lists/*
+
+ RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
+ bash /root/on_startup.sh
+
+ RUN mkdir /data && chown user:user /data
+
+ #######################################
+ # End root user section
+ #######################################
+
+ USER user
+
+ # Python packages
+ RUN --mount=target=requirements.txt,source=requirements.txt \
+ pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ RUN chmod +x start_server.sh
+
+ COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html
+
+ ENV PYTHONUNBUFFERED=1 \
+ GRADIO_ALLOW_FLAGGING=never \
+ GRADIO_NUM_PORTS=1 \
+ GRADIO_SERVER_NAME=0.0.0.0 \
+ GRADIO_THEME=huggingface \
+ HF_HOME=/data/.cache/huggingface \
+ SYSTEM=spaces \
+ SHELL=/bin/bash
+
+ CMD ["./start_server.sh"]
README.md CHANGED
@@ -1,10 +1,15 @@
  ---
- title: Spark On HF JupyterLab
- emoji: 👁
- colorFrom: purple
- colorTo: blue
+ title: Spark on HF JupyterLab
+ emoji: 🌅
+ colorFrom: gray
+ colorTo: red
  sdk: docker
  pinned: false
+ tags:
+ - jupyterlab
+ - spark
+ - datasets
+ suggested_storage: small
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
data/hf_spark_utils.py ADDED
@@ -0,0 +1,183 @@
+ import math
+ import pickle
+ import tempfile
+ from functools import partial
+ from typing import Iterator, Optional, Union
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from huggingface_hub import CommitOperationAdd, HfFileSystem
+ from pyspark.sql.dataframe import DataFrame
+ from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema
+
+ spark = None
+
+ def set_session(session):
+     global spark
+     spark = session
+
+
+ def _read(iterator: Iterator[pa.RecordBatch], columns: Optional[list[str]], filters: Optional[Union[list[tuple], list[list[tuple]]]], **kwargs) -> Iterator[pa.RecordBatch]:
+     for batch in iterator:
+         paths = batch[0].to_pylist()
+         ds = pq.ParquetDataset(paths, **kwargs)
+         yield from ds._dataset.to_batches(columns=columns, filter=pq.filters_to_expression(filters) if filters else None)
+
+
+ def read_parquet(
+     path: str,
+     columns: Optional[list[str]] = None,
+     filters: Optional[Union[list[tuple], list[list[tuple]]]] = None,
+     **kwargs,
+ ) -> DataFrame:
+     """
+     Loads Parquet files from Hugging Face using PyArrow, returning a PySpark `DataFrame`.
+
+     It reads Parquet files in a distributed manner.
+
+     Access private or gated repositories using `huggingface-cli login` or by passing a token
+     using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`
+
+     Parameters
+     ----------
+     path : str
+         Path to the file. Prefix with a protocol like `hf://` to read from Hugging Face.
+         You can read from multiple files if you pass a globstring.
+     columns : list, default None
+         If not None, only these columns will be read from the file.
+     filters : List[Tuple] or List[List[Tuple]], default None
+         To filter out data.
+         Filter syntax: [[(column, op, val), ...],...]
+         where op is [==, =, >, >=, <, <=, !=, in, not in]
+         The innermost tuples are transposed into a set of filters applied
+         through an `AND` operation.
+         The outer list combines these sets of filters through an `OR`
+         operation.
+         A single list of tuples can also be used, meaning that no `OR`
+         operation between sets of filters is to be conducted.
+
+     **kwargs
+         Any additional kwargs are passed to pyarrow.parquet.ParquetDataset.
+
+     Returns
+     -------
+     DataFrame
+         DataFrame based on the Parquet file.
+
+     Examples
+     --------
+     >>> path = "hf://datasets/username/dataset/data.parquet"
+     >>> pd.DataFrame({"foo": range(5), "bar": range(5, 10)}).to_parquet(path)
+     >>> read_parquet(path).show()
+     +---+---+
+     |foo|bar|
+     +---+---+
+     |  0|  5|
+     |  1|  6|
+     |  2|  7|
+     |  3|  8|
+     |  4|  9|
+     +---+---+
+     >>> read_parquet(path, columns=["bar"]).show()
+     +---+
+     |bar|
+     +---+
+     |  5|
+     |  6|
+     |  7|
+     |  8|
+     |  9|
+     +---+
+     >>> sel = [("foo", ">", 2)]
+     >>> read_parquet(path, filters=sel).show()
+     +---+---+
+     |foo|bar|
+     +---+---+
+     |  3|  8|
+     |  4|  9|
+     +---+---+
+     """
+     filesystem: HfFileSystem = kwargs.pop("filesystem") if "filesystem" in kwargs else HfFileSystem(**kwargs.pop("storage_options", {}))
+     paths = filesystem.glob(path)
+     if not paths:
+         raise FileNotFoundError(f"Couldn't find any file at {path}")
+     rdd = spark.sparkContext.parallelize([{"path": path} for path in paths], len(paths))
+     df = spark.createDataFrame(rdd)
+     arrow_schema = pq.read_schema(filesystem.open(paths[0]))
+     schema = pa.schema([field for field in arrow_schema if (columns is None or field.name in columns)], metadata=arrow_schema.metadata)
+     return df.mapInArrow(
+         partial(_read, columns=columns, filters=filters, filesystem=filesystem, schema=arrow_schema, **kwargs),
+         from_arrow_schema(schema),
+     )
+
+
+ def _preupload(iterator: Iterator[pa.RecordBatch], path: str, schema: pa.Schema, filesystem: HfFileSystem, row_group_size: Optional[int] = None, **kwargs) -> Iterator[pa.RecordBatch]:
+     resolved_path = filesystem.resolve_path(path)
+     with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
+         with pq.ParquetWriter(temp_file.name, schema=schema, **kwargs) as writer:
+             for batch in iterator:
+                 writer.write_batch(batch, row_group_size=row_group_size)
+         addition = CommitOperationAdd(path_in_repo=temp_file.name, path_or_fileobj=temp_file.name)
+         filesystem._api.preupload_lfs_files(repo_id=resolved_path.repo_id, additions=[addition], repo_type=resolved_path.repo_type, revision=resolved_path.revision)
+         yield pa.record_batch({"addition": [pickle.dumps(addition)]}, schema=pa.schema({"addition": pa.binary()}))
+
+
+ def _commit(iterator: Iterator[pa.RecordBatch], path: str, filesystem: HfFileSystem, max_operations_per_commit=50) -> Iterator[pa.RecordBatch]:
+     resolved_path = filesystem.resolve_path(path)
+     additions: list[CommitOperationAdd] = [pickle.loads(addition) for addition in pa.Table.from_batches(iterator, schema=pa.schema({"addition": pa.binary()}))[0].to_pylist()]
+     num_commits = math.ceil(len(additions) / max_operations_per_commit)
+     for shard_idx, addition in enumerate(additions):
+         addition.path_in_repo = resolved_path.path_in_repo.replace("{shard_idx:05d}", f"{shard_idx:05d}")
+     for i in range(0, num_commits):
+         operations = additions[i * max_operations_per_commit : (i + 1) * max_operations_per_commit]
+         commit_message = "Upload using PySpark" + (f" (part {i:05d}-of-{num_commits:05d})" if num_commits > 1 else "")
+         filesystem._api.create_commit(repo_id=resolved_path.repo_id, repo_type=resolved_path.repo_type, revision=resolved_path.revision, operations=operations, commit_message=commit_message)
+         yield pa.record_batch({"path": [addition.path_in_repo for addition in operations]}, schema=pa.schema({"path": pa.string()}))
+
+
+ def write_parquet(df: DataFrame, path: str, **kwargs) -> None:
+     """
+     Write Parquet files to Hugging Face using PyArrow.
+
+     It uploads Parquet files in a distributed manner in two steps:
+
+     1. Preupload the Parquet files in parallel in a distributed manner
+     2. Commit the preuploaded files
+
+     Authenticate using `huggingface-cli login` or by passing a token
+     using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`
+
+     Parameters
+     ----------
+     path : str
+         Path of the file or directory. Prefix with a protocol like `hf://` to write to Hugging Face.
+         It writes Parquet files in the form "part-xxxxx.parquet", or to a single file if `path` ends with ".parquet".
+
+     **kwargs
+         Any additional kwargs are passed to pyarrow.parquet.ParquetWriter.
+
+     Returns
+     -------
+     None
+         The files are uploaded and committed to the repository on Hugging Face.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame(pd.DataFrame({"foo": range(5), "bar": range(5, 10)}))
+     >>> # Save to one file
+     >>> write_parquet(df, "hf://datasets/username/dataset/data.parquet")
+     >>> # OR save to a directory (possibly in many files)
+     >>> write_parquet(df, "hf://datasets/username/dataset")
+     """
+     filesystem: HfFileSystem = kwargs.pop("filesystem", HfFileSystem(**kwargs.pop("storage_options", {})))
+     if path.endswith(".parquet") or path.endswith(".pq"):
+         df = df.coalesce(1)
+     else:
+         path += "/part-{shard_idx:05d}.parquet"
+     df.mapInArrow(
+         partial(_preupload, path=path, schema=to_arrow_schema(df.schema), filesystem=filesystem, **kwargs),
+         from_arrow_schema(pa.schema({"addition": pa.binary()})),
+     ).coalesce(1).mapInArrow(
+         partial(_commit, path=path, filesystem=filesystem),
+         from_arrow_schema(pa.schema({"path": pa.string()})),
+     ).collect()
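The notebook below wires these helpers up interactively; for quick reference, here is a minimal sketch of how they compose end to end. The repository IDs and column names are hypothetical, and the token is only needed for private or gated repos, as the docstrings above note.

```python
from pyspark.sql import SparkSession

from hf_spark_utils import read_parquet, set_session, write_parquet

spark = SparkSession.builder.appName("demo").getOrCreate()
set_session(spark)  # hf_spark_utils keeps a module-level reference to the session

# DNF-style filters: each inner list is AND-ed, the outer list is OR-ed
criteria = [
    [("langdetect", "=", "zh-cn")],
    [("langdetect", "=", "en")],
]
df = read_parquet(
    "hf://datasets/username/dataset/*.parquet",   # hypothetical repo
    columns=["text", "langdetect"],
    filters=criteria,
    storage_options={"token": "hf_xxx"},          # only for private/gated repos
)

# Without a ".parquet" suffix the files are written as part-00000.parquet,
# part-00001.parquet, ... and committed in batches of up to 50 operations.
write_parquet(df, "hf://datasets/username/filtered-dataset")
```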
data/spark.ipynb ADDED
@@ -0,0 +1,121 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6fb06d81-1778-403c-b15b-d68200a5e6b5",
+ "metadata": {},
+ "source": [
+ "# Spark on Hugging Face"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7399a5ed-aea8-45cf-866f-2decd7097456",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from pyspark.sql import SparkSession\n",
+ "spark = SparkSession.builder.appName(\"demo\").getOrCreate()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bf07f63-6fed-4cf9-8fee-5f3a5fb6bed1",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Example:\n",
+ "\n",
+ "```python\n",
+ "# Load the BAAI/Infinity-Instruct dataset\n",
+ "df = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\")\n",
+ "\n",
+ "# Load only one column\n",
+ "df_langdetect_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", columns=[\"langdetect\"])\n",
+ "\n",
+ "# Load values within certain ranges\n",
+ "criteria = [(\"langdetect\", \"=\", \"zh-cn\")]\n",
+ "df_chinese_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", filters=criteria)\n",
+ "\n",
+ "# Save dataset\n",
+ "write_parquet(df_chinese_only, \"hf://datasets/username/Infinity-Instruct-Chinese-Only\")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca71b3ac-3291-4e4e-8fee-b3550b0426d6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from hf_spark_utils import read_parquet, write_parquet, set_session\n",
+ "set_session(spark)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07ea62a4-7549-4a75-8a12-9d830f6e3cde",
+ "metadata": {},
+ "source": [
+ "#### (Optional) Login"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "343b3a9a-2dce-492b-9384-703368ba3975",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "notebook_login(new_session=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "332b7609-f0eb-4703-aea6-fec3d09f5870",
+ "metadata": {},
+ "source": [
+ "#### Run your code:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6c0dfe01-9190-454c-9c52-216f74d339e1",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
login.html ADDED
@@ -0,0 +1,70 @@
+ {% extends "page.html" %}
+
+
+ {% block stylesheet %}
+ {% endblock %}
+
+ {% block site %}
+
+ <div id="jupyter-main-app" class="container">
+
+ <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo" style="height: 96px; vertical-align: bottom;"><span style="font-size: 52px;">×</span><img src="https://upload.wikimedia.org/wikipedia/commons/f/f3/Apache_Spark_logo.svg" alt="Apache Spark Logo" style="height: 96px; vertical-align: bottom;">
+ <h4>You must duplicate this Space to use it.</h4>
+ <br>
+ <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/DockerTemplates/jupyterlab?duplicate=true">
+ <img style="margin: 0" src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&amp;style=flat&amp;logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&amp;logoWidth=14" alt="Duplicate Space"></a>
+ <br>
+ <br>
+ <h4>The default token is <span style="color:orange;">huggingface</span></h4>
+ <h4>Duplicate the Space to run your own instance</h4>
+
+ {% if login_available %}
+ {# login_available means password-login is allowed. Show the form. #}
+ <div class="row">
+ <div class="navbar col-sm-8">
+ <div class="navbar-inner">
+ <div class="container">
+ <div class="center-nav">
+ <form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
+ {{ xsrf_form_html() | safe }}
+ {% if token_available %}
+ <label for="password_input"><strong>{% trans %}Token:{% endtrans
+ %}</strong></label>
+ {% else %}
+ <label for="password_input"><strong>{% trans %}Password:{% endtrans %}</strong></label>
+ {% endif %}
+ <input type="password" name="password" id="password_input" class="form-control">
+ <button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
+ %}</button>
+ </form>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ {% else %}
+ <p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
+ {% endif %}
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/spark-ex-min.png" alt="Spark on Hugging Face example Python code" style="width: 100%; margin-bottom: 40px; border-radius: 5px; box-shadow: rgba(149, 157, 165, 0.2) 0px 8px 24px; border: 1rem solid white;">
+ <p>This template was created by <a href="https://twitter.com/camenduru" target="_blank" >camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank" >nateraw</a>, with contributions from <a href="https://huggingface.co/osanseviero" target="_blank" >osanseviero</a>, <a href="https://huggingface.co/azzr" target="_blank" >azzr</a> and <a href="https://huggingface.co/lhoestq" target="_blank">lhoestq</a></p>
+ {% if message %}
+ <div class="row">
+ {% for key in message %}
+ <div class="message {{key}}">
+ {{message[key]}}
+ </div>
+ {% endfor %}
+ </div>
+ {% endif %}
+ {% if token_available %}
+ {% block token_message %}
+
+ {% endblock token_message %}
+ {% endif %}
+ </div>
+
+ {% endblock %}
+
+
+ {% block script %}
+ {% endblock %}
on_startup.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+ # Write some commands here that will run as the root user before startup.
+ # For example, to clone transformers and install it in dev mode:
+ # git clone https://github.com/huggingface/transformers.git
+ # cd transformers && pip install -e ".[dev]"
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tree
+ openjdk-8-jdk
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ jupyterlab==3.6.1
+ jupyter-server==2.3.0
+ tornado==6.2
+ ipywidgets
+ huggingface_hub
+ pyarrow
+ pyspark[sql,pandas_on_spark]
+ plotly
start_server.sh ADDED
@@ -0,0 +1,23 @@
+ #!/bin/bash
+ JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"
+
+ echo "Starting Jupyter Lab with token $JUPYTER_TOKEN"
+
+ NOTEBOOK_DIR="/data"
+ cp -n data/hf_spark_utils.py $NOTEBOOK_DIR/hf_spark_utils.py
+ cp -n data/spark.ipynb $NOTEBOOK_DIR/spark.ipynb
+ DEFAULT_URL="/lab/tree/spark.ipynb"
+
+ jupyter-lab \
+ --ip 0.0.0.0 \
+ --port 7860 \
+ --no-browser \
+ --allow-root \
+ --ServerApp.token="$JUPYTER_TOKEN" \
+ --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
+ --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
+ --ServerApp.disable_check_xsrf=True \
+ --LabApp.news_url=None \
+ --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
+ --LabApp.default_url=$DEFAULT_URL \
+ --notebook-dir=$NOTEBOOK_DIR