#
# This is an example of parsing XML input for PC204 Challenge 2.
# We provide working code for querying the EUtils ESearch service
# for publications of an author, parsing the XML output, and
# returning a list of PubMed IDs as strings.
#
# For Challenge 2, you will also need to query the EUtils
# ESummary service for details corresponding to PubMed IDs
# (in particular, the author list).  You will need to
# construct the query data and parse the XML output, just
# like 'get_pubs_for_author' below.  Constructing the
# query should be straightforward but parsing the output
# requires a bit of work.
#
# We suggest you implement a function 'get_pubs_for_ids'
# which takes a list of PubMed IDs (as strings) and returns
# a map whose keys are PubMed IDs and whose values are
# lists of author names.  It will send a single ESummary
# query and extract the author lists for all publications
# from the XML results.
#
# - ESummary has a 10,000-ID limit for PubMed queries, so you
#   may want to handle that in your function.
# - When processing many PubMed IDs, do NOT send one query
#   per publication.  That is slow on the query side and
#   places an unnecessary load on the EUtils servers.
#
# To parse ESummary results, you will need to implement
# a class similar to 'IDHandler' below, but it will need to
# deal with a few more XML tags.  The full ESummary XML format
# is somewhat complicated, but we can treat the output as
# essentially something like:
#
#     ... stuff we do not care about ...
#     <DocSum>
#         ... stuff we do not care about ...
#         <Id>PubMed ID</Id>
#         ... stuff we do not care about ...
#         <Item Name="Author">Author name</Item>
#         ... more Item elements for authors ...
#         ... stuff we do not care about ...
#     </DocSum>
#     ... more DocSum elements with same structure ...
#
# You may want to start with the 'IDHandler' code and change
# it to handle the extra tags.  For checking the 'Name' attribute
# in an '<Item>' element, you probably want something like:
#
#     def startElement(self, name, attrs):
#         ...
#         if name == "Item" and attrs["Name"] == "Author":
#             ...
#
# Hint: You will probably want to use the '<DocSum>' element
# opening and closing tags as markers for resetting and saving
# the data (PubMed ID and author list) for the current publication.
#
# For those who prefer full documentation, start with
#     https://docs.python.org/2/library/xml.sax.html
# which describes how the Python xml.sax module works.
#
# ---------------------------------------------------------------------------
#
# ESearch URL for Entrez EUtils (http://www.ncbi.nlm.nih.gov/books/NBK25497/)
# NCBI documents the E-utilities base URL with the 'https' scheme, so we
# use it here rather than plain 'http'.
#
SearchURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"


def get_pubs_for_author(author):
    """Return list of PubMed ids for an author.

    'author' should be of form 'Surname Initials' where 'Initials'
    is a single string consisting of the first letter of all
    given names.

    One ESearch request is sent to 'SearchURL' (as a POST, because
    the query is passed as request data) and the XML reply is fed
    to an 'IDHandler'.  The return value is that handler's
    'id_list', i.e. the PubMed IDs as a list of strings.  Errors
    raised by 'urlopen' or by 'xml.sax.parse' are not caught here
    and propagate to the caller."""
    # Import the URL functions we will use
    try:
        # Python 2
        from urllib import urlopen, urlencode
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    from contextlib import closing
    from xml.sax import parse

    data = (
        ("db", "pubmed"),
        ("usehistory", "n"),
        ("retmax", "100000"),
        ("term", "%s[Author]" % author),
    )

    # Create our custom XML handler for extracting PubMed IDs
    id_extractor = IDHandler()

    # Connect to EUtils service.  'f' is a file-like object from
    # which the XML results are read.  'closing' guarantees the
    # connection is released even if parsing fails, and works with
    # both the Python 2 and Python 3 'urlopen' return values.
    with closing(urlopen(SearchURL, urlencode(data).encode("ascii"))) as f:
        # Read the XML stream; the handler collects the PubMed IDs
        parse(f, id_extractor)
    return id_extractor.id_list


#
# The way XML parsing works is that we call
# 'xml.sax.parse' with the input XML stream
# and a 'handler' object.  'parse' reads the
# characters from the input stream and calls
# handler methods at opportune times with the
# relevant data.  For example, when an opening
# tag such as '<Id>' is read, 'parse' calls the
# handler 'startElement' method with the name of
# tag ('Id') and a map of attribute values (empty
# in this example).
#
# To read our specific XML data, eg ESearch
# results, we pass in a customized handler that
# squirrels away the text between '<Id>' and
# '</Id>' tags.  To do this, we create a subclass
# of the standard 'xml.sax.handler.ContentHandler'
# class and override some methods that are called
# by 'xml.sax.parse'.
#
# In fact, we create two subclasses: TextGrabber
# and IDHandler.  TextGrabber provides methods
# for grabbing text during XML processing and
# is intended to be further subclassed for
# specific XML formats.  IDHandler uses TextGrabber
# to save text of '<Id>' elements in a list.
#
from xml.sax.handler import ContentHandler


class TextGrabber(ContentHandler):
    """Class for grabbing text from XML files.

    Subclasses call 'startGrab' when they reach an element whose
    text they want and 'endGrab' when that element closes; the
    'characters' callback accumulates the text in between."""

    # Class-level defaults so that 'characters' and 'endGrab' are
    # safe even if a subclass never calls 'initGrab'.  Both values
    # are immutable, so sharing them between instances is harmless.
    grabbing = False
    grab_content = ""

    def initGrab(self):
        """Initialize text-grabbing state.

        Normally, this function is called from 'startDocument'."""
        self.grabbing = False
        # Also clear saved text, so 'endGrab' without a preceding
        # 'startGrab' returns an empty string instead of failing.
        self.grab_content = ""

    def startGrab(self):
        """Start saving text from XML files.

        Any previously saved text is discarded."""
        self.grabbing = True
        self.grab_content = ""

    def endGrab(self):
        """Stop saving text.

        Returns text saved since the last call to 'startGrab'."""
        self.grabbing = False
        return self.grab_content

    # Override 'ContentHandler' method.
    def characters(self, content):
        """Append 'content' to the saved text while a grab is active.

        The parser may deliver the text of one element in several
        pieces, so the pieces are concatenated."""
        # Save characters during a grab.
        if self.grabbing:
            self.grab_content += content


class IDHandler(TextGrabber):
    """Class for extracting PubMed IDs from ESearch XML results.

    After parsing, 'id_list' holds the text of every 'Id' element
    in document order."""

    # Override 'ContentHandler' method.
    def startDocument(self):
        """Reset all state at the start of each parsed document."""
        # Initialize 'TextGrabber' state.
        self.initGrab()
        # We are not currently grabbing a PubMed ID. (see below)
        self.id_grab = False
        # Set list of saved PubMed IDs to empty list.
        self.id_list = []

    # Override 'ContentHandler' method.
    def startElement(self, name, attrs):
        """Begin grabbing text when an 'Id' element opens."""
        # PubMed IDs are the text within 'Id' tags in
        # ESearch output, so we start grabbing IDs
        # whenever we see an opening tag "<Id>"
        if name == "Id":
            self.id_grab = True
            self.startGrab()

    # Override 'ContentHandler' method.
    def endElement(self, name):
        """Save the grabbed text when an 'Id' element closes."""
        # When we see a closing tag "</Id>", we
        # check if we are grabbing an Id (we _should_
        # be, but it does not hurt to be sure).
        # If we are, terminate the grab and add the
        # saved text to our PubMed ID list.
        if name == "Id" and self.id_grab:
            self.id_list.append(self.endGrab())
            self.id_grab = False


if __name__ == "__main__":
    # Basic test program
    import pprint

    id_list = get_pubs_for_author("EL-SAMAD H")
    pprint.pprint(id_list)