Skip to content

Reducing RAM footprint + Adding preferred term output #67

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions quickumls/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .core import QuickUMLS
from .client import get_quickumls_client
from .about import *
from .client import get_quickumls_client
from .core import QuickUMLS
12 changes: 6 additions & 6 deletions quickumls/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# https://github.com/explosion/spaCy/blob/master/spacy/about.py

__title__ = 'quickumls'
__version__ = '1.4.0r1'
__author__ = 'Luca Soldaini'
__email__ = '[email protected]'
__license__ = 'MIT'
__title__ = "quickumls"
__version__ = "1.4.0r1"
__author__ = "Luca Soldaini"
__email__ = "[email protected]"
__license__ = "MIT"
__uri__ = "https://github.com/Georgetown-IR-Lab/QuickUMLS"
__copyright__ = '2014-2020, Georgetown University Information Retrieval Lab'
__copyright__ = "2014-2020, Georgetown University Information Retrieval Lab"
6 changes: 3 additions & 3 deletions quickumls/client.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from .network import MinimalClient
from .core import QuickUMLS
from .network import MinimalClient


def get_quickumls_client(host='localhost', port=4645):
'''Return a client for a QuickUMLS server running on host at port'''
def get_quickumls_client(host="localhost", port=4645):
"""Return a client for a QuickUMLS server running on host at port"""
client = MinimalClient(QuickUMLS, host=host, port=port, buffersize=4096)
return client
173 changes: 105 additions & 68 deletions quickumls/constants.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
HEADERS_MRCONSO = [
'cui', 'lat', 'ts', 'lui', 'stt', 'sui', 'ispref', 'aui', 'saui',
'scui', 'sdui', 'sab', 'tty', 'code', 'str', 'srl', 'suppress', 'cvf'
]
HEADERS_MRSTY = [
'cui', 'sty', 'hier' 'desc', 'sid', 'num'
"cui",
"lat",
"ts",
"lui",
"stt",
"sui",
"ispref",
"aui",
"saui",
"scui",
"sdui",
"sab",
"tty",
"code",
"str",
"srl",
"suppress",
"cvf",
]
HEADERS_MRSTY = ["cui", "sty", "hier", "desc", "sid", "num"]

NEGATIONS = {'none', 'non', 'neither', 'nor', 'no', 'not'}
NEGATIONS = {"none", "non", "neither", "nor", "no", "not"}

# The following is a list of all existing semtypes along with their name and some examples.
# The following is a list of all existing semtypes along with their name and some examples.
# You can easily select the ones you need by commenting out the lines that are not relevant for your application.

ACCEPTED_SEMTYPES = {
Expand All @@ -21,7 +35,7 @@
# 'T190', # Anatomical Abnormality, ex.: Bronchial Fistula; Foot Deformities; Hyperostosis of skull
# 'T017', # Anatomical Structure, ex.: Cadaver; Pharyngostome; Anatomic structures
# 'T008', # Animal, ex.: Animals; Animals, Laboratory; Carnivore
'T195', # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
"T195", # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
# 'T194', # Archaeon, ex.: Thermoproteales; Haloferax volcanii; Methanospirillum
# 'T007', # Bacterium, ex.: Acetobacter; Bacillus cereus; Cytophaga
# 'T053', # Behavior, ex.: Homing Behavior; Sexuality; Habitat Selection
Expand All @@ -30,10 +44,10 @@
# 'T091', # Biomedical Occupation or Discipline, ex.: Adolescent Medicine; Cellular Neurobiology; Dentistry
# 'T122', # Biomedical or Dental Material, ex.: Acrylic Resins; Bone Cements; Dentifrices
# 'T012', # Bird, ex.: Serinus; Ducks; Quail
'T029', # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
'T023', # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
"T029", # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
"T023", # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
# 'T030', # Body Space or Junction, ex.: Knee joint; Greater sac of peritoneum; Synapses
'T031', # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
"T031", # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
# 'T022', # Body System, ex.: Endocrine system; Renin-angiotensin system; Reticuloendothelial System
# 'T088', # Carbohydrate Sequence, ex.: Carbohydrate Sequence; Abnormal carbohydrate sequence
# 'T025', # Cell, ex.: B-Lymphocytes; Dendritic Cells; Fibroblasts
Expand All @@ -44,14 +58,14 @@
# 'T120', # Chemical Viewed Functionally, ex.: Aerosol Propellants; Detergents; Stabilizing Agents
# 'T104', # Chemical Viewed Structurally, ex.: Ammonium Compounds; Cations; Sulfur Compounds
# 'T185', # Classification, ex.: Anatomy (MeSH Category); Tumor Stage Classification; axis i
'T201', # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
'T200', # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
"T201", # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
"T200", # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
# 'T077', # Conceptual Entity, ex.: Geographic Factors; Fractals; Secularism
# 'T019', # Congenital Abnormality, ex.: Albinism; Cleft palate with cleft lip; Polydactyly of toes
# 'T056', # Daily or Recreational Activity, ex.: Badminton; Dancing; Swimming
'T060', # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
'T047', # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
'T203', # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
"T060", # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
"T047", # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
"T203", # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
# 'T065', # Educational Activity, ex.: Academic Training; Family Planning Training; Preceptorship
# 'T196', # Element, Ion, or Isotope, ex.: Carbon; Chromium Isotopes; Radioisotopes
# 'T018', # Embryonic Structure, ex.: Blastoderm; Fetus; Neural Crest
Expand All @@ -62,7 +76,7 @@
# 'T051', # Event, ex.: Anniversaries; Exposure to Mumps virus (event); Device Unattended
# 'T050', # Experimental Model of Disease, ex.: Alloxan Diabetes; Liver Cirrhosis, Experimental; Transient Gene Knock-Out
# 'T099', # Family Group, ex.: Daughter; Is an only child; Unmarried Fathers
'T033', # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
"T033", # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
# 'T013', # Fish, ex.: Bass; Salmonidae; Whitefish
# 'T168', # Food, ex.: Beverages; Egg Yolk (Dietary); Ice Cream
# 'T021', # Fully Formed Anatomical Structure, ex.: Entire body as a whole; Female human body; Set of parts of human body
Expand All @@ -75,32 +89,32 @@
# 'T096', # Group, ex.: Focus Groups; jury; teams
# 'T102', # Group Attribute, ex.: Family Size; Group Structure; Life Expectancy
# 'T131', # Hazardous or Poisonous Substance, ex.: Carcinogens; Fumigant; Mutagens
'T058', # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
"T058", # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
# 'T093', # Health Care Related Organization, ex.: Centers for Disease Control and Prevention (U.S.); Halfway Houses;
# 'T125', # Hormone, ex.: Enteric Hormones; thymic humoral factor; Prohormone
# 'T016', # Human, ex.: Homo sapiens; jean piaget; Member of public
# 'T068', # Human-caused Phenomenon or Process, ex.: Baby Boom; Cultural Evolution; Mass Media
# 'T078', # Idea or Concept, ex.: Capitalism; Civil Rights; Ethics
# 'T129', # Immunologic Factor, ex.: Antigens; Immunologic Factors; Blood group antigen P
'T130', # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
"T130", # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
# 'T055', # Individual Behavior, ex.: Assertiveness; Grooming; Risk-Taking
'T037', # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
"T037", # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
# 'T197', # Inorganic Chemical, ex.: Carbonic Acid; aluminum nitride; ferric citrate
'T170', # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
'T034', # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
'T059', # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
"T170", # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
"T034", # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
"T059", # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
# 'T171', # Language, ex.: Armenian language; braille; Bilingualism
# 'T066', # Machine Activity, ex.: Computer Simulation; Equipment Failure; Natural Language Processing
# 'T015', # Mammal, ex.: Ursidae Family; Hamsters; Macaca
# 'T073', # Manufactured Object, ex.: car seat; Cooking and Eating Utensils; Goggles
'T074', # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
'T048', # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
'T041', # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
"T074", # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
"T048", # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
"T041", # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
# 'T063', # Molecular Biology Research Technique, ex.: Northern Blotting; Genetic Engineering; In Situ Hybridization
# 'T044', # Molecular Function, ex.: Binding, Competitive; Electron Transport; Glycolysis
# 'T085', # Molecular Sequence, ex.: Genetic Code; Homologous Sequences; Molecular Sequence
# 'T070', # Natural Phenomenon or Process, ex.: Air Movements; Corrosion; Lightning (phenomenon)
'T191', # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
"T191", # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
# 'T114', # Nucleic Acid, Nucleoside, or Nucleotide, ex.: Cytosine Nucleotides; Guanine; Oligonucleotides
# 'T086', # Nucleotide Sequence, ex.: Base Sequence; Direct Repeat; RNA Sequence
# 'T090', # Occupation or Discipline, ex.: Aviation; Craniology; Ecology
Expand All @@ -109,14 +123,14 @@
# 'T109', # Organic Chemical, ex.: Benzene Derivatives
# 'T001', # Organism, ex.: Organism; Infectious agent; Heterotroph
# 'T032', # Organism Attribute, ex.: Age; Birth Weight; Eye Color
'T040', # Organism Function, ex.: Breeding; Hibernation; Motor Skills
"T040", # Organism Function, ex.: Breeding; Hibernation; Motor Skills
# 'T092', # Organization, ex.: Labor Unions; United Nations; Boarding school
'T046', # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
"T046", # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
# 'T101', # Patient or Disabled Group, ex.: Amputees; Institutionalized Child; Mentally Ill Persons
'T121', # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
'T067', # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
"T121", # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
"T067", # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
# 'T072', # Physical Object, ex.: Printed Media; Meteors; Physical object
'T039', # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
"T039", # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
# 'T002', # Plant, ex.: Aloe; Pollen; Helianthus species
# 'T098', # Population Group, ex.: Asian Americans; Ethnic group; Adult Offenders
# 'T097', # Professional or Occupational Group, ex.: Clergy; Demographers; Hospital Volunteers
Expand All @@ -129,60 +143,83 @@
# 'T062', # Research Activity, ex.: Animal Experimentation; Biomedical Research; Experimental Replication
# 'T075', # Research Device, ex.: Electrodes, Enzyme; DNA Microarray Chip; Particle Count and Size Analyzer
# 'T095', # Self-help or Relief Organization, ex.: Alcoholics Anonymous; Charities - organization; Red Cross
'T184', # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
"T184", # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
# 'T054', # Social Behavior, ex.: Acculturation; Communication; Interpersonal Relations
# 'T082', # Spatial Concept, ex.: Mandibular Rest Position; Lateral; Extrinsic
# 'T167', # Substance, ex.: Air (substance); Fossils; Plastics
# 'T079', # Temporal Concept, ex.: Birth Intervals; Half-Life; Postoperative Period
'T061', # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
"T061", # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
# 'T024', # Tissue, ex.: Cartilage; Endothelium; Epidermis
# 'T010', # Vertebrate, ex.: Vertebrates; Gnathostomata vertebrate; Craniata <chordata>
# 'T005', # Virus, ex.: Coliphages; Echovirus; Parvoviridae
# 'T127' # Vitamin, ex.: 5,25-Dihydroxy cholecalciferol; alpha-tocopheryl oxalate; Vitamin A [EPC]
}

UNICODE_DASHES = {
u'\u002d', u'\u007e', u'\u00ad', u'\u058a', u'\u05be', u'\u1400',
u'\u1806', u'\u2010', u'\u2011', u'\u2010', u'\u2012', u'\u2013',
u'\u2014', u'\u2015', u'\u2053', u'\u207b', u'\u2212', u'\u208b',
u'\u2212', u'\u2212', u'\u2e17', u'\u2e3a', u'\u2e3b', u'\u301c',
u'\u3030', u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63',
u'\uff0d'
u"\u002d",
u"\u007e",
u"\u00ad",
u"\u058a",
u"\u05be",
u"\u1400",
u"\u1806",
u"\u2010",
u"\u2011",
u"\u2010",
u"\u2012",
u"\u2013",
u"\u2014",
u"\u2015",
u"\u2053",
u"\u207b",
u"\u2212",
u"\u208b",
u"\u2212",
u"\u2212",
u"\u2e17",
u"\u2e3a",
u"\u2e3b",
u"\u301c",
u"\u3030",
u"\u30a0",
u"\ufe31",
u"\ufe32",
u"\ufe58",
u"\ufe63",
u"\uff0d",
}

# Languages mapped to None
# have no tokenization support.
LANGUAGES = {
'BAQ': None, # Basque
'CHI': None, # Chinese
'CZE': None, # Czech
'DAN': 'danish', # Danish
'DUT': 'dutch', # Dutch
'ENG': 'english', # English
'EST': None, # Estonian
'FIN': 'finnish', # Finnish
'FRE': 'french', # French
'GER': 'german', # German
'GRE': 'greek', # Greek
'HEB': None, # Hebrew
'HUN': 'hungarian', # Hungarian
'ITA': 'italian', # Italian
'JPN': None, # Japanese
'KOR': None, # Korean
'LAV': None, # Latvian
'NOR': 'norwegian', # Norwegian
'POL': 'polish', # Polish
'POR': 'portoguese', # Portuguese
'RUS': 'russian', # Russian
'SCR': None, # Croatian
'SPA': 'spanish', # Spanish
'SWE': 'swedish', # Swedish
'TUR': 'turkish', # Turkish
"BAQ": None, # Basque
"CHI": None, # Chinese
"CZE": None, # Czech
"DAN": "danish", # Danish
"DUT": "dutch", # Dutch
"ENG": "english", # English
"EST": None, # Estonian
"FIN": "finnish", # Finnish
"FRE": "french", # French
"GER": "german", # German
"GRE": "greek", # Greek
"HEB": None, # Hebrew
"HUN": "hungarian", # Hungarian
"ITA": "italian", # Italian
"JPN": None, # Japanese
"KOR": None, # Korean
"LAV": None, # Latvian
"NOR": "norwegian", # Norwegian
"POL": "polish", # Polish
"POR": "portoguese", # Portuguese
"RUS": "russian", # Russian
"SCR": None, # Croatian
"SPA": "spanish", # Spanish
"SWE": "swedish", # Swedish
"TUR": "turkish", # Turkish
}

DOMAIN_SPECIFIC_STOPWORDS = {
'time'
}
DOMAIN_SPECIFIC_STOPWORDS = {"time"}

SPACY_LANGUAGE_MAP = {
'ENG': 'en_core_web_sm',
Expand Down
Loading