Generate Sentences

Download Raw Clone


  1. # Place the script in a folder with the package.zip data dump file or unzipped folder.
  2. # The script generates a messages.txt file of all of your messages; the corpus
  3. # and a generated_sentences.txt file for all the results.
  4. # To run do `python3.6 -m pip install markovify spacy`, spacy is optional
  5. # and then `python3.6 -m spacy download en`, to download the model
  6. import zipfile
  7. import os
  8. import csv
  9. import markovify
  10. try:
  11. import spacy
  12. except (ImportError, OSError):
  13. spacy = None
  14. nlp = None
  15. else:
  16. try:
  17. nlp = spacy.load("en")
  18. except OSError:
  19. nlp = None
  20. import time
  21. data_name = "package"
  22. messages_file = "messages.csv"
  23. timeout = 60 * 30 # wait 30 minutes before beginning to generate the corpus
  24. def main():
  25. unzip()
  26. messages = get_messages()
  27. with open("messages.txt", "w+", encoding="utf-8") as f:
  28. f.truncate(0)
  29. f.write(messages)
  30. sentence_count = input("Sentences to generate: ")
  31. try:
  32. sentence_count = int(sentence_count)
  33. except ValueError:
  34. print("The number of sentences must be an integer")
  35. return
  36. if sentence_count <= 0:
  37. print("Can't generate less than one sentence.")
  38. return
  39. start = time.perf_counter()
  40. if spacy is not None:
  41. if nlp is None:
  42. use_spacy = False
  43. print("Can't use Spacy; language model has not been installed.")
  44. else:
  45. use_spacy_input = input("Use Spacy [Y/n]: ")
  46. use_spacy = None
  47. yes_options = ("yes", "y", "")
  48. no_options = ("no", "n")
  49. if use_spacy_input.casefold() in yes_options:
  50. use_spacy = True
  51. elif use_spacy_input.casefold() in no_options:
  52. use_spacy = False
  53. else:
  54. use_spacy = False
  55. print("Can't use Spacy; module has not been installed.")
  56. if use_spacy is None:
  57. print("Invalid option")
  58. return
  59. text_model = create_model(messages, use_spacy)
  60. result = generate_sentences(text_model, sentence_count)
  61. sentence_generation_time = time.perf_counter() - start
  62. print(
  63. f"Took {sentence_generation_time} seconds "
  64. f"to generate {sentence_count} sentences."
  65. )
  66. write_start = time.perf_counter()
  67. write_text(result)
  68. elapsed = time.perf_counter()
  69. write_elapsed = elapsed - write_start
  70. total_elapsed = elapsed - start
  71. print(
  72. f"Took {write_elapsed} seconds to write {sentence_count} sentences.\n"
  73. f"Took {total_elapsed} seconds total."
  74. )
  75. def unzip():
  76. try:
  77. with zipfile.ZipFile(f"{data_name}.zip", "r") as f:
  78. f.extractall(data_name)
  79. except FileNotFoundError:
  80. pass
  81. def get_messages():
  82. messages = ""
  83. guilds = os.walk(f"{data_name}/messages")
  84. next(guilds) # skip root dir
  85. for root, _, files in guilds:
  86. if messages_file in files:
  87. try:
  88. with open(os.path.join(root, messages_file), encoding="utf-8") as f:
  89. message_reader = csv.reader(f)
  90. next(message_reader) # skip titles
  91. for line in message_reader:
  92. if len(line) > 2:
  93. messages += line[2]
  94. messages += "\n"
  95. except FileNotFoundError:
  96. print(f"{root} does not have {messages_file}.")
  97. return messages
  98. class POSifiedText(markovify.NewlineText):
  99. def word_split(self, sentence):
  100. return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]
  101. def word_join(self, words):
  102. return " ".join(word.split("::")[0] for word in words)
  103. def create_model(messages, use_spacy: bool = False):
  104. if use_spacy is True:
  105. return POSifiedText(messages)
  106. else:
  107. return markovify.NewlineText(messages)
  108. def generate_sentences(model, sentence_count):
  109. result = []
  110. for i in range(sentence_count):
  111. # Sentence generation has built in retries for cases like too similar to corpus.
  112. # However it may still fail and return None
  113. sentence = model.make_sentence()
  114. if sentence is not None:
  115. result.append(sentence)
  116. return result
  117. def write_text(text):
  118. with open("generated_sentences.txt", "w+", encoding="utf-8") as f:
  119. f.truncate(0)
  120. f.write("\n".join(text))
# Run the interactive pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Raw paste data: