MasonCrinr committed on
Commit
762a084
1 Parent(s): 30a0b0e

Upload 580 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .github/workflows +39 -0
  3. .gitignore +135 -0
  4. Advanced_Usage.md +103 -0
  5. CHANGELOG.md +36 -0
  6. CITATION.cff +10 -0
  7. Dockerfile +34 -0
  8. LICENSE +201 -0
  9. MANIFEST.in +2 -0
  10. README.md +227 -0
  11. examples/favorite_riding_hood.mp3 +0 -0
  12. examples/favorites/atkins_mha.mp3 +0 -0
  13. examples/favorites/atkins_omicron.mp3 +0 -0
  14. examples/favorites/atkins_value.mp3 +0 -0
  15. examples/favorites/daniel_craig_dumbledore.mp3 +0 -0
  16. examples/favorites/daniel_craig_training_ethics.mp3 +0 -0
  17. examples/favorites/dotrice_stop_for_death.mp3 +0 -0
  18. examples/favorites/emma_stone_courage.mp3 +0 -0
  19. examples/favorites/emma_stone_training_ethics.mp3 +0 -0
  20. examples/favorites/halle_barry_dumbledore.mp3 +0 -0
  21. examples/favorites/halle_barry_oar_to_oar.mp3 +0 -0
  22. examples/favorites/henry_cavill_metallic_hydrogen.mp3 +0 -0
  23. examples/favorites/kennard_road_not_taken.mp3 +0 -0
  24. examples/favorites/morgan_freeman_metallic_hydrogen.mp3 +0 -0
  25. examples/favorites/myself_gatsby.mp3 +0 -0
  26. examples/favorites/patrick_stewart_omicron.mp3 +0 -0
  27. examples/favorites/patrick_stewart_secret_of_life.mp3 +0 -0
  28. examples/favorites/robert_deniro_review.mp3 +0 -0
  29. examples/favorites/william_shatner_spacecraft_interview.mp3 +0 -0
  30. examples/finetuned/lj/1.mp3 +0 -0
  31. examples/finetuned/lj/2.mp3 +0 -0
  32. examples/finetuned/lj/3.mp3 +0 -0
  33. examples/finetuned/lj/4.mp3 +0 -0
  34. examples/naturalspeech_comparison/fibers/naturalspeech.mp3 +0 -0
  35. examples/naturalspeech_comparison/fibers/tortoise.mp3 +0 -0
  36. examples/naturalspeech_comparison/lax/naturalspeech.mp3 +0 -0
  37. examples/naturalspeech_comparison/lax/tortoise.mp3 +0 -0
  38. examples/naturalspeech_comparison/maltby/naturalspeech.mp3 +0 -0
  39. examples/naturalspeech_comparison/maltby/tortoise.mp3 +0 -0
  40. examples/prompting/angry.mp3 +0 -0
  41. examples/prompting/happy.mp3 +0 -0
  42. examples/prompting/sad.mp3 +0 -0
  43. examples/prompting/scared.mp3 +0 -0
  44. examples/riding_hood/angelina.mp3 +0 -0
  45. examples/riding_hood/craig.mp3 +0 -0
  46. examples/riding_hood/deniro.mp3 +0 -0
  47. examples/riding_hood/emma.mp3 +0 -0
  48. examples/riding_hood/freeman.mp3 +0 -0
  49. examples/riding_hood/geralt.mp3 +0 -0
  50. examples/riding_hood/halle.mp3 +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tortoise/voices/angie/2.wav filter=lfs diff=lfs merge=lfs -text
37
+ tortoise/voices/deniro/2.wav filter=lfs diff=lfs merge=lfs -text
38
+ tortoise/voices/train_lescault/lescault_new4.wav filter=lfs diff=lfs merge=lfs -text
.github/workflows ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This workflow will upload a Python Package using Twine when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ deploy:
20
+
21
+ runs-on: ubuntu-latest
22
+
23
+ steps:
24
+ - uses: actions/checkout@v3
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v3
27
+ with:
28
+ python-version: '3.x'
29
+ - name: Install dependencies
30
+ run: |
31
+ python -m pip install --upgrade pip
32
+ pip install build
33
+ - name: Build package
34
+ run: python -m build
35
+ - name: Publish package
36
+ uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37
+ with:
38
+ user: __token__
39
+ password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ .idea/*
132
+ .models/*
133
+ .custom/*
134
+ results/*
135
+ debug_states/*
Advanced_Usage.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Advanced Usage
2
+
3
+ ### Generation settings
4
+
5
+ Tortoise is primarily an autoregressive decoder model combined with a diffusion model. Both of these have a lot of knobs
6
+ that can be turned that I've abstracted away for the sake of ease of use. I did this by generating thousands of clips using
7
+ various permutations of the settings and using a metric for voice realism and intelligibility to measure their effects. I've
8
+ set the defaults to the best overall settings I was able to find. For specific use-cases, it might be effective to play with
9
+ these settings (and it's very likely that I missed something!)
10
+
11
+ These settings are not available in the normal scripts packaged with Tortoise. They are available, however, in the API. See
12
+ ```api.tts``` for a full list.
13
+
14
+ ### Prompt engineering
15
+
16
+ Some people have discovered that it is possible to do prompt engineering with Tortoise! For example, you can evoke emotion
17
+ by including things like "I am really sad," before your text. I've built an automated redaction system that you can use to
18
+ take advantage of this. It works by attempting to redact any text in the prompt surrounded by brackets. For example, the
19
+ prompt "\[I am really sad,\] Please feed me." will only speak the words "Please feed me" (with a sad tonality).
20
+
21
+ ### Playing with the voice latent
22
+
23
+ Tortoise ingests reference clips by feeding them individually through a small submodel that produces a point latent,
24
+ then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents
25
+ are quite expressive, affecting everything from tone to speaking rate to speech abnormalities.
26
+
27
+ This lends itself to some neat tricks. For example, you can feed two different voices to tortoise and it will output
28
+ what it thinks the "average" of those two voices sounds like.
29
+
30
+ #### Generating conditioning latents from voices
31
+
32
+ Use the script `get_conditioning_latents.py` to extract conditioning latents for a voice you have installed. This script
33
+ will dump the latents to a .pth pickle file. The file will contain a single tuple, (autoregressive_latent, diffusion_latent).
34
+
35
+ Alternatively, use the api.TextToSpeech.get_conditioning_latents() to fetch the latents.
36
+
37
+ #### Using raw conditioning latents to generate speech
38
+
39
+ After you've played with them, you can use them to generate speech by creating a subdirectory in voices/ with a single
40
+ ".pth" file containing the pickled conditioning latents as a tuple (autoregressive_latent, diffusion_latent).
41
+
42
+ ## Tortoise-detect
43
+
44
+ Out of concerns that this model might be misused, I've built a classifier that tells the likelihood that an audio clip
45
+ came from Tortoise.
46
+
47
+ This classifier can be run on any computer, usage is as follows:
48
+
49
+ ```commandline
50
+ python tortoise/is_this_from_tortoise.py --clip=<path_to_suspicious_audio_file>
51
+ ```
52
+
53
+ This model has 100% accuracy on the contents of the results/ and voices/ folders in this repo. Still, treat this classifier
54
+ as a "strong signal". Classifiers can be fooled and it is likewise not impossible for this classifier to exhibit false
55
+ positives.
56
+
57
+ ## Model architecture
58
+
59
+ Tortoise TTS is inspired by OpenAI's DALLE, applied to speech data and using a better decoder. It is made up of 5 separate
60
+ models that work together. I've assembled a write-up of the system architecture here:
61
+ [https://nonint.com/2022/04/25/tortoise-architectural-design-doc/](https://nonint.com/2022/04/25/tortoise-architectural-design-doc/)
62
+
63
+ ## Training
64
+
65
+ These models were trained on my "homelab" server with 8 RTX 3090s over the course of several months. They were trained on a dataset consisting of
66
+ ~50k hours of speech data, most of which was transcribed by [ocotillo](http://www.github.com/neonbjb/ocotillo). Training was done on my own
67
+ [DLAS](https://github.com/neonbjb/DL-Art-School) trainer.
68
+
69
+ I currently do not have plans to release the training configurations or methodology. See the next section.
70
+
71
+ ## Ethical Considerations
72
+
73
+ Tortoise v2 works considerably better than I had planned. When I began hearing some of the outputs of the last few versions, I began
74
+ wondering whether or not I had an ethically unsound project on my hands. The ways in which a voice-cloning text-to-speech system
75
+ could be misused are many. It doesn't take much creativity to think up how.
76
+
77
+ After some thought, I have decided to go forward with releasing this. Following are the reasons for this choice:
78
+
79
+ 1. It is primarily good at reading books and speaking poetry. Other forms of speech do not work well.
80
+ 2. It was trained on a dataset which does not have the voices of public figures. While it will attempt to mimic these voices if they are provided as references, it does not do so in such a way that most humans would be fooled.
81
+ 3. The above points could likely be resolved by scaling up the model and the dataset. For this reason, I am currently withholding details on how I trained the model, pending community feedback.
82
+ 4. I am releasing a separate classifier model which will tell you whether a given audio clip was generated by Tortoise or not. See `tortoise-detect` above.
83
+ 5. If I, a tinkerer with a BS in computer science with a ~$15k computer can build this, then any motivated corporation or state can as well. I would prefer that it be in the open and everyone know the kinds of things ML can do.
84
+
85
+ ### Diversity
86
+
87
+ The diversity expressed by ML models is strongly tied to the datasets they were trained on.
88
+
89
+ Tortoise was trained primarily on a dataset consisting of audiobooks. I made no effort to
90
+ balance diversity in this dataset. For this reason, Tortoise will be particularly poor at generating the voices of minorities
91
+ or of people who speak with strong accents.
92
+
93
+ ## Looking forward
94
+
95
+ Tortoise v2 is about as good as I think I can do in the TTS world with the resources I have access to. A phenomenon that happens when
96
+ training very large models is that as parameter count increases, the communication bandwidth needed to support distributed training
97
+ of the model increases multiplicatively. On enterprise-grade hardware, this is not an issue: GPUs are attached together with
98
+ exceptionally wide buses that can accommodate this bandwidth. I cannot afford enterprise hardware, though, so I am stuck.
99
+
100
+ I want to mention here
101
+ that I think Tortoise could be a **lot** better. The three major components of Tortoise are either vanilla Transformer Encoder stacks
102
+ or Decoder stacks. Both of these types of models have a rich experimental history with scaling in the NLP realm. I see no reason
103
+ to believe that the same is not true of TTS.
CHANGELOG.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Changelog
2
+ #### v3.0.0; 2023/10/18
3
+ - Added fast inference for tortoise with HiFi Decoder (inspired by xtts by [coquiTTS](https://github.com/coqui-ai/TTS) 🐸, check out their multilingual model for noncommercial uses)
4
+ #### v2.8.0; 2023/9/13
5
+ - Added custom tokenizer for non-english models
6
+ #### v2.7.0; 2023/7/26
7
+ - Bug fixes
8
+ - Added Apple Silicon Support
9
+ - Updated Transformer version
10
+ #### v2.6.0; 2023/7/26
11
+ - Bug fixes
12
+
13
+ #### v2.5.0; 2023/7/09
14
+ - Added kv_cache support (5x faster)
15
+ - Added deepspeed support (10x faster)
16
+ - Added half precision support
17
+
18
+ #### v2.4.0; 2022/5/17
19
+ - Removed CVVP model. Found that it does not, in fact, make an appreciable difference in the output.
20
+ - Add better debugging support; existing tools now spit out debug files which can be used to reproduce bad runs.
21
+
22
+ #### v2.3.0; 2022/5/12
23
+ - New CLVP-large model for further improved decoding guidance.
24
+ - Improvements to read.py and do_tts.py (new options)
25
+
26
+ #### v2.2.0; 2022/5/5
27
+ - Added several new voices from the training set.
28
+ - Automated redaction. Wrap the text you want to use to prompt the model but not be spoken in brackets.
29
+ - Bug fixes
30
+
31
+ #### v2.1.0; 2022/5/2
32
+ - Added ability to produce totally random voices.
33
+ - Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
34
+ - Added ability to use your own pretrained models.
35
+ - Refactored directory structures.
36
+ - Performance improvements & bug fixes.
CITATION.cff ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ authors:
4
+ - family-names: "Betker"
5
+ given-names: "James"
6
+ orcid: "https://orcid.org/0000-0003-3259-4862"
7
+ title: "TorToiSe text-to-speech"
8
+ version: 2.0
9
+ date-released: 2022-04-28
10
+ url: "https://github.com/neonbjb/tortoise-tts"
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.2.0-base-ubuntu22.04
2
+
3
+ COPY . /app
4
+
5
+ RUN apt-get update && \
6
+ apt-get install -y --allow-unauthenticated --no-install-recommends \
7
+ wget \
8
+ git \
9
+ && apt-get autoremove -y \
10
+ && apt-get clean -y \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ ENV HOME "/root"
14
+ ENV CONDA_DIR "${HOME}/miniconda"
15
+ ENV PATH="$CONDA_DIR/bin":$PATH
16
+ ENV CONDA_AUTO_UPDATE_CONDA=false
17
+ ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
18
+ # ENV requires a value; default to tortoise's documented model-cache location — adjust if models are mounted elsewhere
+ ENV TORTOISE_MODELS_DIR "${HOME}/.cache/tortoise/models"
19
+
20
+ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
21
+ && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
22
+ && "${CONDA_DIR}/bin/conda" init bash \
23
+ && rm -f /tmp/miniconda3.sh \
24
+ && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
25
+
26
+ # --login option used to source bashrc (thus activating conda env) at every RUN statement
27
+ SHELL ["/bin/bash", "--login", "-c"]
28
+
29
+ RUN conda create --name tortoise python=3.9 numba inflect \
30
+ && conda activate tortoise \
31
+ && conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
32
+ && conda install transformers=4.29.2 \
33
+ && cd /app \
34
+ && python setup.py install
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
MANIFEST.in ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ recursive-include tortoise/data *
2
+ recursive-include tortoise/voices *
README.md ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TorToiSe
2
+
3
+ Tortoise is a text-to-speech program built with the following priorities:
4
+
5
+ 1. Strong multi-voice capabilities.
6
+ 2. Highly realistic prosody and intonation.
7
+
8
+ This repo contains all the code needed to run Tortoise TTS in inference mode.
9
+
10
+ Manuscript: https://arxiv.org/abs/2305.07243
11
+ ## Hugging Face space
12
+
13
+ A live demo is hosted on Hugging Face Spaces. If you'd like to avoid a queue, please duplicate the Space and add a GPU. Please note that CPU-only spaces do not work for this demo.
14
+
15
+ https://huggingface.co/spaces/Manmay/tortoise-tts
16
+
17
+ ## Install via pip
18
+ ```bash
19
+ pip install tortoise-tts
20
+ ```
21
+
22
+ If you would like to install the latest development version, you can also install it directly from the git repository:
23
+
24
+ ```bash
25
+ pip install git+https://github.com/neonbjb/tortoise-tts
26
+ ```
27
+
28
+ ## What's in a name?
29
+
30
+ I'm naming my speech-related repos after Mojave desert flora and fauna. Tortoise is a bit tongue in cheek: this model
31
+ is insanely slow. It leverages both an autoregressive decoder **and** a diffusion decoder; both known for their low
32
+ sampling rates. On a K80, expect to generate a medium sized sentence every 2 minutes.
33
+
34
+ well..... not so slow anymore now we can get a **0.25-0.3 RTF** on 4GB vram and with streaming we can get < **500 ms** latency !!!
35
+
36
+ ## Demos
37
+
38
+ See [this page](http://nonint.com/static/tortoise_v2_examples.html) for a large list of example outputs.
39
+
40
+ A cool application of Tortoise + GPT-3 (not affiliated with this repository): https://twitter.com/lexman_ai. Unfortunately, this project no longer seems to be active.
41
+
42
+ ## Usage guide
43
+
44
+ ### Local installation
45
+
46
+ If you want to use this on your own computer, you must have an NVIDIA GPU.
47
+
48
+ On Windows, I **highly** recommend using the Conda installation path. I have been told that if you do not do this, you
49
+ will spend a lot of time chasing dependency problems.
50
+
51
+ First, install miniconda: https://docs.conda.io/en/latest/miniconda.html
52
+
53
+ Then run the following commands, using anaconda prompt as the terminal (or any other terminal configured to work with conda)
54
+
55
+ This will:
56
+ 1. create conda environment with minimal dependencies specified
57
+ 1. activate the environment
58
+ 1. install pytorch with the command provided here: https://pytorch.org/get-started/locally/
59
+ 1. clone tortoise-tts
60
+ 1. change the current directory to tortoise-tts
61
+ 1. run tortoise python setup install script
62
+
63
+ ```shell
64
+ conda create --name tortoise python=3.9 numba inflect
65
+ conda activate tortoise
66
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
67
+ conda install transformers=4.29.2
68
+ git clone https://github.com/neonbjb/tortoise-tts.git
69
+ cd tortoise-tts
70
+ python setup.py install
71
+ ```
72
+
73
+ Optionally, pytorch can be installed in the base environment, so that other conda environments can use it too. To do this, simply run the `conda install pytorch...` line before activating the tortoise environment.
74
+
75
+ > **Note:** When you want to use tortoise-tts, you will always have to ensure the `tortoise` conda environment is activated.
76
+
77
+ If you are on windows, you may also need to install pysoundfile: `conda install -c conda-forge pysoundfile`
78
+
79
+ ### Docker
80
+
81
+ An easy way to hit the ground running and a good jumping off point depending on your use case.
82
+
83
+ ```sh
84
+ git clone https://github.com/neonbjb/tortoise-tts.git
85
+ cd tortoise-tts
86
+
87
+ docker build . -t tts
88
+
89
+ docker run --gpus all \
90
+ -e TORTOISE_MODELS_DIR=/models \
91
+ -v /mnt/user/data/tortoise_tts/models:/models \
92
+ -v /mnt/user/data/tortoise_tts/results:/results \
93
+ -v /mnt/user/data/.cache/huggingface:/root/.cache/huggingface \
94
+ -v /root:/work \
95
+ -it tts
96
+ ```
97
+ This gives you an interactive terminal in an environment that's ready to do some tts. Now you can explore the different interfaces that tortoise exposes for tts.
98
+
99
+ For example:
100
+
101
+ ```sh
102
+ cd app
103
+ conda activate tortoise
104
+ time python tortoise/do_tts.py \
105
+ --output_path /results \
106
+ --preset ultra_fast \
107
+ --voice geralt \
108
+ --text "Time flies like an arrow; fruit flies like a banana."
109
+ ```
110
+
111
+ ## Apple Silicon
112
+
113
+ On macOS 13+ with M1/M2 chips you need to install the nightly version of PyTorch, as stated on the official page. You can do:
114
+
115
+ ```shell
116
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
117
+ ```
118
+
119
+ Be sure to do that after you activate the environment. If you don't use conda the commands would look like this:
120
+
121
+ ```shell
122
+ python3.10 -m venv .venv
123
+ source .venv/bin/activate
124
+ pip install numba inflect psutil
125
+ pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
126
+ pip install transformers
127
+ git clone https://github.com/neonbjb/tortoise-tts.git
128
+ cd tortoise-tts
129
+ pip install .
130
+ ```
131
+
132
+ Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored.
133
+ You may need to prepend `PYTORCH_ENABLE_MPS_FALLBACK=1` to the commands below to make them work since MPS does not support all the operations in Pytorch.
134
+
135
+
136
+ ### do_tts.py
137
+
138
+ This script allows you to speak a single phrase with one or more voices.
139
+ ```shell
140
+ python tortoise/do_tts.py --text "I'm going to speak this" --voice random --preset fast
141
+ ```
142
+ ### faster inference read.py
143
+
144
+ This script provides tools for reading large amounts of text.
145
+
146
+ ```shell
147
+ python tortoise/read_fast.py --textfile <your text to be read> --voice random
148
+ ```
149
+
150
+ ### read.py
151
+
152
+ This script provides tools for reading large amounts of text.
153
+
154
+ ```shell
155
+ python tortoise/read.py --textfile <your text to be read> --voice random
156
+ ```
157
+
158
+ This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series
159
+ of spoken clips as they are generated. Once all the clips are generated, it will combine them into a single file and
160
+ output that as well.
161
+
162
+ Sometimes Tortoise screws up an output. You can re-generate any bad clips by re-running `read.py` with the --regenerate
163
+ argument.
164
+
165
+ ### API
166
+
167
+ Tortoise can be used programmatically, like so:
168
+
169
+ ```python
170
+ reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
171
+ tts = api.TextToSpeech()
172
+ pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
173
+ ```
174
+
175
+ To use deepspeed:
176
+
177
+ ```python
178
+ reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
179
+ tts = api.TextToSpeech(use_deepspeed=True)
180
+ pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
181
+ ```
182
+
183
+ To use kv cache:
184
+
185
+ ```python
186
+ reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
187
+ tts = api.TextToSpeech(kv_cache=True)
188
+ pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
189
+ ```
190
+
191
+ To run model in float16:
192
+
193
+ ```python
194
+ reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
195
+ tts = api.TextToSpeech(half=True)
196
+ pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
197
+ ```
198
+ for Faster runs use all three:
199
+
200
+ ```python
201
+ reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
202
+ tts = api.TextToSpeech(use_deepspeed=True, kv_cache=True, half=True)
203
+ pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
204
+ ```
205
+
206
+ ## Acknowledgements
207
+
208
+ This project has garnered more praise than I expected. I am standing on the shoulders of giants, though, and I want to
209
+ credit a few of the amazing folks in the community that have helped make this happen:
210
+
211
+ - Hugging Face, who wrote the GPT model and the generate API used by Tortoise, and who hosts the model weights.
212
+ - [Ramesh et al](https://arxiv.org/pdf/2102.12092.pdf) who authored the DALLE paper, which is the inspiration behind Tortoise.
213
+ - [Nichol and Dhariwal](https://arxiv.org/pdf/2102.09672.pdf) who authored the (revision of) the code that drives the diffusion model.
214
+ - [Jang et al](https://arxiv.org/pdf/2106.07889.pdf) who developed and open-sourced univnet, the vocoder this repo uses.
215
+ - [Kim and Jung](https://github.com/mindslab-ai/univnet) who implemented univnet pytorch model.
216
+ - [lucidrains](https://github.com/lucidrains) who writes awesome open source pytorch models, many of which are used here.
217
+ - [Patrick von Platen](https://huggingface.co/patrickvonplaten) whose guides on setting up wav2vec were invaluable to building my dataset.
218
+
219
+ ## Notice
220
+
221
+ Tortoise was built entirely by the author (James Betker) using their own hardware. Their employer was not involved in any facet of Tortoise's development.
222
+
223
+ ## License
224
+
225
+ Tortoise TTS is licensed under the Apache 2.0 license.
226
+
227
+ If you use this repo or the ideas therein for your research, please cite it! A BibTeX entry can be found in the right pane on GitHub.
examples/favorite_riding_hood.mp3 ADDED
Binary file (970 kB). View file
 
examples/favorites/atkins_mha.mp3 ADDED
Binary file (31.6 kB). View file
 
examples/favorites/atkins_omicron.mp3 ADDED
Binary file (41.3 kB). View file
 
examples/favorites/atkins_value.mp3 ADDED
Binary file (18.5 kB). View file
 
examples/favorites/daniel_craig_dumbledore.mp3 ADDED
Binary file (24 kB). View file
 
examples/favorites/daniel_craig_training_ethics.mp3 ADDED
Binary file (48.9 kB). View file
 
examples/favorites/dotrice_stop_for_death.mp3 ADDED
Binary file (28.8 kB). View file
 
examples/favorites/emma_stone_courage.mp3 ADDED
Binary file (34.1 kB). View file
 
examples/favorites/emma_stone_training_ethics.mp3 ADDED
Binary file (48 kB). View file
 
examples/favorites/halle_barry_dumbledore.mp3 ADDED
Binary file (21.5 kB). View file
 
examples/favorites/halle_barry_oar_to_oar.mp3 ADDED
Binary file (40.9 kB). View file
 
examples/favorites/henry_cavill_metallic_hydrogen.mp3 ADDED
Binary file (32 kB). View file
 
examples/favorites/kennard_road_not_taken.mp3 ADDED
Binary file (28.5 kB). View file
 
examples/favorites/morgan_freeman_metallic_hydrogen.mp3 ADDED
Binary file (35.4 kB). View file
 
examples/favorites/myself_gatsby.mp3 ADDED
Binary file (28.1 kB). View file
 
examples/favorites/patrick_stewart_omicron.mp3 ADDED
Binary file (37.6 kB). View file
 
examples/favorites/patrick_stewart_secret_of_life.mp3 ADDED
Binary file (36.5 kB). View file
 
examples/favorites/robert_deniro_review.mp3 ADDED
Binary file (36.1 kB). View file
 
examples/favorites/william_shatner_spacecraft_interview.mp3 ADDED
Binary file (47.3 kB). View file
 
examples/finetuned/lj/1.mp3 ADDED
Binary file (38.2 kB). View file
 
examples/finetuned/lj/2.mp3 ADDED
Binary file (26.1 kB). View file
 
examples/finetuned/lj/3.mp3 ADDED
Binary file (18.5 kB). View file
 
examples/finetuned/lj/4.mp3 ADDED
Binary file (22.9 kB). View file
 
examples/naturalspeech_comparison/fibers/naturalspeech.mp3 ADDED
Binary file (33.1 kB). View file
 
examples/naturalspeech_comparison/fibers/tortoise.mp3 ADDED
Binary file (33.5 kB). View file
 
examples/naturalspeech_comparison/lax/naturalspeech.mp3 ADDED
Binary file (41 kB). View file
 
examples/naturalspeech_comparison/lax/tortoise.mp3 ADDED
Binary file (42.3 kB). View file
 
examples/naturalspeech_comparison/maltby/naturalspeech.mp3 ADDED
Binary file (35.1 kB). View file
 
examples/naturalspeech_comparison/maltby/tortoise.mp3 ADDED
Binary file (36.9 kB). View file
 
examples/prompting/angry.mp3 ADDED
Binary file (7.63 kB). View file
 
examples/prompting/happy.mp3 ADDED
Binary file (8.3 kB). View file
 
examples/prompting/sad.mp3 ADDED
Binary file (6.29 kB). View file
 
examples/prompting/scared.mp3 ADDED
Binary file (6.86 kB). View file
 
examples/riding_hood/angelina.mp3 ADDED
Binary file (866 kB). View file
 
examples/riding_hood/craig.mp3 ADDED
Binary file (826 kB). View file
 
examples/riding_hood/deniro.mp3 ADDED
Binary file (851 kB). View file
 
examples/riding_hood/emma.mp3 ADDED
Binary file (807 kB). View file
 
examples/riding_hood/freeman.mp3 ADDED
Binary file (943 kB). View file
 
examples/riding_hood/geralt.mp3 ADDED
Binary file (788 kB). View file
 
examples/riding_hood/halle.mp3 ADDED
Binary file (785 kB). View file