Spaces:

darylfunggg
/

text-analysis

Running

App Files Files Community

text-analysis / keyphrase_extraction.py

Daryl Fung

added top 10

2a000a7 over 1 year ago

raw

history blame

No virus

6.29 kB

	from textblob import TextBlob
	import spacy
	from spacy import displacy
	import pandas as pd
	import seaborn as sns
	import textrank
	import matplotlib.pyplot as plt
	from pathlib import Path
	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.corpus import stopwords
	from nltk.probability import FreqDist
	from nltk.tokenize import word_tokenize
	from nltk.stem import PorterStemmer
	from sklearn.metrics.pairwise import cosine_similarity
	import networkx as nx
	import matplotlib.pyplot as plt
	import numpy as np
	import pytextrank

	# Load the small English spaCy model once, at import time; the functions in
	# this module that call `nlp(...)` all share this single pipeline object.
	nlp = spacy.load("en_core_web_sm")
	# Append the 'textrank' component so processed docs expose `doc._.phrases`.
	# NOTE(review): the 'textrank' factory is expected to be registered as a side
	# effect of `import pytextrank` -- keep that import even though it looks unused.
	nlp.add_pipe('textrank')

	# Sample corpus used by the `__main__` demo: short descriptions of hospital
	# discharge-abstract data holdings (DAD and similar) from several Canadian
	# jurisdictions, one description per line.
	text = """
	Database that collects, administrative, clinical and demographic information on hospital discharges (including deaths, sign-outs and transfers). Some provinces and territories also use the DAD to capture day surgery.
	The discharge abstract database is a database for information on all AHS separations for acute care institutions, including discharges, deaths, sign-outs and transfers.
	Data on discharges, transfers and deaths of in-patients and day surgery patients from acute care hospitals in BC. All Canadian hospitals (except those in Quebec) submit their separations records directly to the Canadian Institute of Health information (CIHI) for inclusion in the Discharge Abstract Database (DAD). The database contains demographic, administrative and clinical data for hospital discharges (inpatient acute, chronic, rehabilitation) and day surgeries. A provincial data set, including various CIHI value-added elements (such as case mix groups, and resource intensity weights) is released on a monthly basis to the respective Ministries of Health. The DAD data files which Population Data BC receives include the CIHI variables. Population Data BC receives these data once per year.
	Health data maintained by Manitoba Health consisting of hospital forms/computerized records containing summaries of demographic and clinical information (e.g., gender, postal code, diagnoses and procedure codes) completed at the point of discharge from the hospital. Several hundred thousand abstracts per year are submitted for all separations from acute and chronic care facilities in Manitoba and for all Manitobans admitted to out-of-province facilities. The Hospital Abstracts Data includes records of both Manitoba residents and non-Manitoba residents hospitalized in Manitoba facilities and information about inpatient and day surgery services.
	Patient discharge information from New Brunswick hospitals. Captures administrative, clinical and demographic information including discharges, deaths, sign-outs, and transfers.
	The Provincial Discharge Abstract Database (PDAD) is the NLCHI dataset that contains demographic, clinical and administrative data collected at hospitals when patients are discharged from inpatient and surgical day care services and submitted to the CIHI Discharge Abstract Database. The PDAD captures information regarding hospitalizations of both residents of NL and non-residents receiving care in NL.
	Contains information on each hospital admission recorded in a Nova Scotia hospital
	The Discharge Abstract Database is a database for information on all separation from acute care institutions within a fiscal year (April 1st to March 31st). Data is received directly from acute care facilities or from their respective health/regional authority or ministry/department of health.
	Captures administrative, clinical and demographic information on discharges for acute care facilities (including deaths, sign-outs and transfers).

	"""

	def get_top_key_phrases(text, top_n, save_output):
		"""Chart the leading TextRank key phrases of ``text`` and save the figure.

		The text is run through the module-level ``nlp`` pipeline; the first
		``top_n`` distinct phrase texts from ``doc._.phrases`` are drawn as a
		horizontal bar chart of their ``rank`` values.

		Args:
			text: Raw text to analyse.
			top_n: Maximum number of phrases to chart (the phrase list is sliced
				with ``[:top_n]``).
			save_output: Destination handed to ``Figure.savefig``.

		Returns:
			None. The chart is written to ``save_output``.
		"""
		doc = nlp(text)

		# Key by phrase text so each surface form gets exactly one bar.
		# NOTE(review): this relies on ``doc._.phrases`` already being ordered
		# best-first, so slicing yields the *top* phrases -- confirm for the
		# installed pytextrank version.
		phrases_ranking = {phrase.text: phrase.rank for phrase in doc._.phrases}
		# Building the frame from (text, rank) pairs avoids the zip(*...)[0]
		# unpacking, which raised IndexError when no phrases were found.
		top_items = list(phrases_ranking.items())[:top_n]
		keyword_df = pd.DataFrame(top_items, columns=['words', 'scores'])

		# Create the figure *before* styling it. Previously the title was set
		# first, which put it on an implicit throwaway figure that was never
		# closed and left the saved chart untitled.
		fig, ax = plt.subplots(figsize=(24, 8))
		try:
			sns.barplot(data=keyword_df, y='words', x='scores', palette='blend:#7AB,#EDA', ax=ax)
			ax.set_title("Word Count")
			ax.set_xlabel("scores", fontsize=15)
			ax.set_ylabel("words", fontsize=15)
			# Applied after plotting so it affects the final category labels.
			ax.tick_params(axis='y', labelsize=15)
			fig.savefig(save_output, dpi=300, bbox_inches="tight")
		finally:
			# Always release the figure, even if plotting or saving fails.
			plt.close(fig)

	def visualize_textrank(text):
		"""Draw a graph chaining the ten top-scoring TextRank words of ``text``.

		Each top word becomes a node, and every word is linked to the next one
		in the ranked list. The graph is shown with ``plt.show()``.

		Args:
			text: Raw text to analyse.

		Returns:
			None.
		"""
		# NOTE(review): depends on the imported ``textrank`` module providing
		# ``TextRank().calculate_scores(text)`` and ``top_words(n)`` yielding
		# (word, score) pairs -- confirm against the installed package.
		tr = textrank.TextRank()
		tr.calculate_scores(text)

		# Keep only the words of the top 10 (word, score) pairs.
		words = [word for word, _score in tr.top_words(10)]

		G = nx.Graph()
		G.add_nodes_from(words)

		# Count adjacent pairs in the *ranked list*. These are neighbours in
		# rank order, not co-occurrences measured in the source text.
		counts = {}
		for pair in zip(words, words[1:]):
			counts[pair] = counts.get(pair, 0) + 1

		for (w1, w2), count in counts.items():
			G.add_edge(w1, w2, weight=count)

		# G.edges() yields bare (u, v) tuples, so the old e[2]['weight'] lookup
		# raised IndexError; data='weight' yields (u, v, weight) triples in the
		# same edge order that nx.draw uses.
		widths = [weight for _u, _v, weight in G.edges(data='weight')]
		nx.draw(G, with_labels=True, width=widths)
		plt.show()


	def display_key_phrases(text, save_output):
		"""Render ``text`` with its key phrases highlighted and save the markup.

		Every chunk of every phrase in ``doc._.phrases`` becomes a highlighted
		span labelled with the phrase rank rounded to two decimals. Higher ranks
		get a darker green.

		Args:
			text: Raw text to analyse.
			save_output: Path of the file the rendered page is written to
				(UTF-8).

		Returns:
			The markup string returned by ``displacy.render`` (a full page,
			since ``page=True``).
		"""
		# NOTE(review): pads each newline with a leading space before parsing,
		# presumably to keep words on either side of a line break apart -- confirm.
		text = text.replace('\n', ' \n')
		doc = nlp(text)
		key_phrases = [
			{'start': chunk.start_char, 'end': chunk.end_char, 'label': str(round(phrase.rank, 2))}
			for phrase in doc._.phrases
			for chunk in phrase.chunks
		]

		# Map rank to HSL lightness: the highest rank gets 50%, rank 0 gets 100%.
		# Take the real maximum rather than trusting element 0, and guard the
		# division so empty results or an all-zero ranking no longer raise.
		max_rank = max((float(span['label']) for span in key_phrases), default=0.0)
		step = 50 / max_rank if max_rank > 0 else 0.0
		colors = {
			span['label']: f'hsl(121, 100%, {100 - float(span["label"]) * step}%)'
			for span in key_phrases
		}
		options = {'ents': list(colors), 'colors': colors}

		# displaCy's manual mode walks the spans left to right through the text,
		# so pass them in document order rather than rank order.
		sentence = [{
			'text': text,
			'ents': sorted(key_phrases, key=lambda span: (span['start'], span['end'])),
			'title': None,
		}]

		html = displacy.render(sentence, style="ent", options=options, manual=True, page=True)
		# The context manager closes the handle; the old bare open().write() leaked it.
		with Path(save_output).open('w', encoding='utf-8') as handle:
			handle.write(html)
		return html


	if __name__ == "__main__":
		# Script entry point: show the top-word graph for the bundled sample text.
		visualize_textrank(text)
		# The other two demos are kept here as disabled usage examples (the
		# output paths are only illustrative):
		# get_top_key_phrases(text, 10, 'test_results/keyphrase.png')
		# display_key_phrases(text, 'test_results/keyphrase.html')