Python解析xml文档实战案例

xml文档

<?xml version="1.0" ?>

<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">

<PubmedArticleSet>

<PubmedArticle>

    <MedlineCitation Status="MEDLINE" Owner="NLM">

        <PMID Version="1">28901317</PMID>

        <DateCompleted>

            <Year>2018</Year>

            <Month>05</Month>

            <Day>10</Day>

        </DateCompleted>

        <DateRevised>

            <Year>2018</Year>

            <Month>12</Month>

            <Day>02</Day>

        </DateRevised>

        <Article PubModel="Print">

            <Journal>

                <ISSN IssnType="Electronic">1998-4138</ISSN>

                <JournalIssue CitedMedium="Internet">

                    <Volume>13</Volume>

                    <Issue>4</Issue>

                    <PubDate>

                        <Year>2017</Year>

                    </PubDate>

                </JournalIssue>

                <Title>Journal of cancer research and therapeutics</Title>

                <ISOAbbreviation>J Cancer Res Ther</ISOAbbreviation>

            </Journal>

            <ArticleTitle><i>k-RAS</i> mutation and resistance to epidermal growth factor receptor-tyrosine kinase inhibitor treatment in patients with nonsmall cell lung cancer.</ArticleTitle>

            <Pagination>

                <MedlinePgn>699-701</MedlinePgn>

            </Pagination>

            <ELocationID EIdType="doi" ValidYN="Y">10.4103/jcrt.JCRT_468_17</ELocationID>

            <Abstract>

                <AbstractText Label="OBJECTIVE" NlmCategory="OBJECTIVE">The aim of this study was to evaluate the relationship between k-RAS gene mutation and the resistance to epidermal growth factor receptor-tyrosine kinase inhibitor (EGFR-TKI) treatment in patients with nonsmall-cell lung cancer (NSCLC).</AbstractText>

                <AbstractText Label="METHODS" NlmCategory="METHODS">Forty-five pathologies confirmed NSCLC patients who received EGFR-TKI (Gefitinib) treatment were retrospectively included in this study. The mutation of codon 12 and 13, located in exon1 and exon 2 of k-RAS gene were examined by polymerase chain reaction (PCR) and DAN sequencing in tumor samples of the included 45 NSCLC patients. The correlation between Gefitinib treatment response and k-RAS mutation status was analyzed in tumor samples of the 45 NSCLC patients.</AbstractText>

                <AbstractText Label="RESULTS" NlmCategory="RESULTS">Eight tumor samples of the 45 NSCLC patients were found to be mutated in coden 12 or 13, with an mutation rate of 17.8% (8/45); the objective response rate (ORR) was 29.7%(11/37) with 1 cases of complete response (CR) and 10 cases of partial response in k-RAS mutation negative patients. Furthermore, the ORR was 0.0% in k-RAS mutation positive patients with none CR. The ORR between k-RAS mutation and nonmutation patients were significant different (P &lt; 0.05).</AbstractText>

                <AbstractText Label="CONCLUSION" NlmCategory="CONCLUSIONS">k-RAS gene mutation status was associated with the response of Gefitinib treatment in patients with NSCLC.</AbstractText>

            </Abstract>

            <AuthorList CompleteYN="Y">

                <Author ValidYN="Y">

                    <LastName>Zhou</LastName>

                    <ForeName>Bin</ForeName>

                    <Initials>B</Initials>

                    <AffiliationInfo>

                        <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation>

                    </AffiliationInfo>

                </Author>

                <Author ValidYN="Y">

                    <LastName>Tang</LastName>

                    <ForeName>Congrong</ForeName>

                    <Initials>C</Initials>

                    <AffiliationInfo>

                        <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation>

                    </AffiliationInfo>

                </Author>

                <Author ValidYN="Y">

                    <LastName>Li</LastName>

                    <ForeName>Jie</ForeName>

                    <Initials>J</Initials>

                    <AffiliationInfo>

                        <Affiliation>Department of Pharmacy, Ruian People's Hospital, Ruian, Zhejiang, Province 325200, PR China.</Affiliation>

                    </AffiliationInfo>

                </Author>

            </AuthorList>

            <Language>eng</Language>

            <PublicationTypeList>

                <PublicationType UI="D016428">Journal Article</PublicationType>

            </PublicationTypeList>

        </Article>

        <MedlineJournalInfo>

            <Country>India</Country>

            <MedlineTA>J Cancer Res Ther</MedlineTA>

            <NlmUniqueID>101249598</NlmUniqueID>

            <ISSNLinking>1998-4138</ISSNLinking>

        </MedlineJournalInfo>

        <ChemicalList>

            <Chemical>

                <RegistryNumber>0</RegistryNumber>

                <NameOfSubstance UI="C117307">KRAS protein, human</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>0</RegistryNumber>

                <NameOfSubstance UI="D047428">Protein Kinase Inhibitors</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>0</RegistryNumber>

                <NameOfSubstance UI="D011799">Quinazolines</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>EC 2.7.10.1</RegistryNumber>

                <NameOfSubstance UI="C512478">EGFR protein, human</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>EC 2.7.10.1</RegistryNumber>

                <NameOfSubstance UI="D066246">ErbB Receptors</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>EC 3.6.5.2</RegistryNumber>

                <NameOfSubstance UI="D016283">Proto-Oncogene Proteins p21(ras)</NameOfSubstance>

            </Chemical>

            <Chemical>

                <RegistryNumber>S65743JHBS</RegistryNumber>

                <NameOfSubstance UI="D000077156">Gefitinib</NameOfSubstance>

            </Chemical>

        </ChemicalList>

        <CitationSubset>IM</CitationSubset>

        <MeshHeadingList>

            <MeshHeading>

                <DescriptorName UI="D000328" MajorTopicYN="N">Adult</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D000368" MajorTopicYN="N">Aged</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D002289" MajorTopicYN="N">Carcinoma, Non-Small-Cell Lung</DescriptorName>

                <QualifierName UI="Q000188" MajorTopicYN="Y">drug therapy</QualifierName>

                <QualifierName UI="Q000235" MajorTopicYN="N">genetics</QualifierName>

                <QualifierName UI="Q000473" MajorTopicYN="N">pathology</QualifierName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D019008" MajorTopicYN="N">Drug Resistance, Neoplasm</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D066246" MajorTopicYN="N">ErbB Receptors</DescriptorName>

                <QualifierName UI="Q000037" MajorTopicYN="N">antagonists &amp; inhibitors</QualifierName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D005260" MajorTopicYN="N">Female</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D000077156" MajorTopicYN="N">Gefitinib</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D008875" MajorTopicYN="N">Middle Aged</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D009154" MajorTopicYN="N">Mutation</DescriptorName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D047428" MajorTopicYN="N">Protein Kinase Inhibitors</DescriptorName>

                <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D016283" MajorTopicYN="N">Proto-Oncogene Proteins p21(ras)</DescriptorName>

                <QualifierName UI="Q000235" MajorTopicYN="Y">genetics</QualifierName>

            </MeshHeading>

            <MeshHeading>

                <DescriptorName UI="D011799" MajorTopicYN="N">Quinazolines</DescriptorName>

                <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName>

            </MeshHeading>

        </MeshHeadingList>

    </MedlineCitation>

    <PubmedData>

        <History>

            <PubMedPubDate PubStatus="entrez">

                <Year>2017</Year>

                <Month>9</Month>

                <Day>14</Day>

                <Hour>6</Hour>

                <Minute>0</Minute>

            </PubMedPubDate>

            <PubMedPubDate PubStatus="pubmed">

                <Year>2017</Year>

                <Month>9</Month>

                <Day>14</Day>

                <Hour>6</Hour>

                <Minute>0</Minute>

            </PubMedPubDate>

            <PubMedPubDate PubStatus="medline">

                <Year>2018</Year>

                <Month>5</Month>

                <Day>11</Day>

                <Hour>6</Hour>

                <Minute>0</Minute>

            </PubMedPubDate>

        </History>

        <PublicationStatus>ppublish</PublicationStatus>

        <ArticleIdList>

            <ArticleId IdType="pubmed">28901317</ArticleId>

            <ArticleId IdType="pii">JCanResTher_2017_13_4_699_214476</ArticleId>

            <ArticleId IdType="doi">10.4103/jcrt.JCRT_468_17</ArticleId>

        </ArticleIdList>

    </PubmedData>

</PubmedArticle>

</PubmedArticleSet>

　　方法一：xml.etree.cElementTree

# -*- coding: utf-8 -*-

"""

@Datetime: 2019/4/25

@Author: Zhang Yafei

"""

import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from itertools import chain

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree exposes the same API.
    import xml.etree.ElementTree as ET

import pandas as pd

# One lock shared by every worker thread. A lock only serialises writers when
# they all acquire the *same* object, so it cannot be created inside the call.
_CSV_WRITE_LOCK = threading.Lock()

# Column order of the header row that get_files_path() writes to each CSV.
# Rows are appended without a header, so they must use exactly this order.
_CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
                'file_path', 'journal', 'mesh_words', 'pub_year']


def _collect_affiliations(article):
    """Return the distinct author affiliations of ``article`` joined by ';'."""
    author_list = article.find('AuthorList')
    if author_list is None:
        return ''
    found = set()
    for author in author_list.findall('Author'):
        for info in author.findall('AffiliationInfo'):
            text = info.findtext('Affiliation')
            if text:
                found.add(text)
    # Sorted so the output does not depend on set iteration order.
    return ';'.join(sorted(found))


def _publication_year(article):
    """Return the year from PubDate/Year, else the first digits of MedlineDate."""
    pub_date = article.find('Journal/JournalIssue/PubDate')
    if pub_date is None:
        return ''
    year = pub_date.findtext('Year')
    if year:
        return year
    match = re.search(r'\d+', pub_date.findtext('MedlineDate') or '')
    return match.group(0) if match else ''


def _mesh_terms(citation):
    """Return MeSH terms joined by ';'.

    A heading with a single child contributes that child's text; otherwise
    every qualifier is emitted as 'Descriptor/Qualifier'.
    """
    heading_list = citation.find('MeshHeadingList')
    if heading_list is None:
        return ''
    terms = []
    for heading in heading_list.findall('MeshHeading'):
        children = list(heading)
        if len(children) == 1:
            terms.append(children[0].text or '')
            continue
        descriptor = ''
        for child in children:
            if child.tag == 'DescriptorName':
                descriptor = child.text or ''
            elif descriptor and child.tag == 'QualifierName':
                terms.append(descriptor + '/' + (child.text or ''))
    return ';'.join(terms)


def pubmed_xml_parser(path):
    """Parse one PubMed XML export and append its articles to a CSV file.

    Every ``PubmedArticle`` element in the file yields one row; an article
    whose PMID was already seen in the same file is skipped. The rows are
    appended, without a header, to ``<first path component>.csv`` in the
    current working directory.

    :param path: path of the XML file; its first component (split on either
        ``\\`` or ``/``) names the output CSV.
    :return: None
    """
    # Accept both Windows and POSIX separators when deriving the group name.
    dir_name = re.split(r'[\\/]', path)[0]
    print(dir_name)
    root = ET.parse(path).getroot()
    data_list = []
    seen_pmids = set()
    for article_node in root.iter('PubmedArticle'):
        citation = article_node.find('MedlineCitation')
        pmid = citation.find('PMID').text
        if pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)
        article = citation.find('Article')
        journal = article.find('Journal').find('ISOAbbreviation').text
        affiliations_info = _collect_affiliations(article)
        date = _publication_year(article)
        if citation.find('MeshHeadingList') is None:
            # Report articles that carry no MeSH indexing.
            print(pmid)
        mesh_words = _mesh_terms(citation)
        # Iterate the element directly: Element.getchildren() no longer
        # exists on Python 3.9+.
        article_type = '/'.join(
            (pub_type.text or '') for pub_type in article.find('PublicationTypeList'))
        country = citation.find('MedlineJournalInfo').find('Country').text
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info,
             'pub_year': date, 'mesh_words': mesh_words, 'country': country,
             'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')
    # Fix the column order explicitly; otherwise it follows dict insertion
    # order and no longer lines up with the CSV header.
    df = pd.DataFrame(data_list, columns=_CSV_COLUMNS)
    with _CSV_WRITE_LOCK:
        df.to_csv('{}.csv'.format(dir_name), encoding='utf_8_sig', mode='a',
                  index=False, header=False)

def to_excel(data, path):
    """Write ``data`` to an Excel workbook at ``path`` as one sheet named 'table'.

    :param data: object exposing ``to_excel`` (a pandas DataFrame).
    :param path: destination workbook path.
    :return: None
    """
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)

def get_files_path():
    """Collect the XML files that still have to be parsed.

    Walks the three drug folders for ``*.xml`` files, then removes every path
    already listed in the ``file_path`` column of the per-folder CSV files.
    A CSV that does not exist yet is created containing only the header row.

    :return: a ``set`` of pending paths when at least one CSV already
        existed, otherwise the ``list`` of all discovered paths.
    """
    drug_dirs = ('first in class drug', 'follow on drug', 'me too drug')
    columns = ['PMID', 'affiliations_info', 'article_type', 'country', 'file_path',
               'journal', 'mesh_words', 'pub_year']

    # Accumulate across every directory level; os.walk yields nothing for a
    # folder that does not exist, so the list is always defined.
    file_list = []
    for drug_dir in drug_dirs:
        for base_path, _folders, files in os.walk(drug_dir):
            file_list.extend(
                os.path.join(base_path, name) for name in files if name.endswith('.xml'))

    parsed_files = set()
    resumed = False
    for drug_dir in drug_dirs:
        csv_path = '{}.csv'.format(drug_dir)
        if os.path.exists(csv_path):
            resumed = True
            # utf_8_sig strips the BOM that the header was written with.
            df = pd.read_csv(csv_path, encoding='utf_8_sig')
            parsed_files.update(df.file_path.tolist())
        else:
            # Rows are appended without a header, so each file needs one up front.
            pd.DataFrame(columns=columns).to_csv(csv_path, encoding='utf_8_sig', index=False)

    print('共需解析文件：{0}'.format(len(file_list)))
    print('已解析文件：{0}'.format(len(parsed_files)))
    if resumed:
        return set(file_list) - parsed_files
    return file_list

if __name__ == '__main__':
    # Parse every pending XML file on a thread pool, one task per file.
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xml_parser, file_path): file_path
                       for file_path in files_list}
            # Collect each result so a failure in a worker is reported instead
            # of being silently dropped; one bad file does not stop the rest.
            for future, file_path in futures.items():
                try:
                    future.result()
                except Exception as exc:
                    print('{0}\t解析失败：{1!r}'.format(file_path, exc))

　　方法二：lxml+xpath

# -*- coding: utf-8 -*-

"""

@Datetime: 2019/4/26

@Author: Zhang Yafei

"""

import os

import re

import threading

from concurrent.futures import ThreadPoolExecutor

from lxml import etree

import pandas as pd

# One lock shared by every worker thread. A lock only serialises writers when
# they all acquire the *same* object, so it cannot be created inside the call.
_PUBMED_CSV_LOCK = threading.Lock()

# Column order of the header row that get_files_path() writes to pubmed.csv.
# Rows are appended without a header, so they must use exactly this order.
_PUBMED_CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
                       'file_path', 'journal', 'mesh_words', 'pub_year']


def pubmed_xpath_parse(path):
    """Parse one PubMed XML export with lxml/XPath and append rows to pubmed.csv.

    Every ``PubmedArticle`` element yields one row; an article whose PMID was
    already seen in the same file is skipped. All rows of the file are
    appended to ``pubmed.csv`` in a single write, without a header.

    :param path: path of the XML file to parse.
    :return: None
    """
    tree = etree.parse(path)
    # If the XML carries a DTD declaration that has to be honoured, build the
    # parser explicitly (the DTD file must sit next to the XML file):
    # parser = etree.XMLParser(load_dtd=True)
    # tree = etree.parse('1.xml', parser=parser)
    data_list = []
    seen_pmids = set()
    for articles in tree.xpath('//PubmedArticle'):
        pmid = articles.xpath('MedlineCitation/PMID/text()')[0]
        if pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)
        Article = articles.xpath('MedlineCitation/Article')[0]
        journal = Article.xpath('Journal/ISOAbbreviation/text()')[0]

        # A missing node gives an empty XPath result (IndexError on [0], not
        # AttributeError), so check the result instead of catching.
        affiliation_set = set()
        for info in Article.xpath('AuthorList/Author/AffiliationInfo'):
            texts = info.xpath('Affiliation/text()')
            if texts:
                affiliation_set.add(texts[0])
        # Sorted so the output does not depend on set iteration order.
        affiliations_info = ';'.join(sorted(affiliation_set))

        years = Article.xpath('Journal/JournalIssue/PubDate/Year/text()')
        if years:
            date = years[0]
        else:
            medline_dates = Article.xpath('Journal/JournalIssue/PubDate/MedlineDate/text()')
            match = re.search(r'\d+', medline_dates[0]) if medline_dates else None
            date = match.group(0) if match else ''

        mesh_words = []
        for mesh_heading in articles.xpath('MedlineCitation/MeshHeadingList/MeshHeading'):
            children = mesh_heading.xpath('child::*')
            if len(children) == 1:
                mesh_words.append(children[0].text or '')
                continue
            mesh_name = ''
            for mesh in children:
                if mesh.tag == 'DescriptorName':
                    mesh_name = mesh.xpath('string()')
                    continue
                if mesh_name and mesh.tag == 'QualifierName':
                    mesh_words.append(mesh_name + '/' + mesh.xpath('string()'))
        mesh_words = ';'.join(mesh_words)

        article_type = '/'.join(
            x.xpath('string()') for x in Article.xpath('PublicationTypeList/PublicationType'))
        country = articles.xpath('MedlineCitation/MedlineJournalInfo/Country/text()')[0]
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info,
             'pub_year': date, 'mesh_words': mesh_words, 'country': country,
             'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')

    # Write once, after the loop: writing inside it re-appends every
    # previously collected row on each iteration.
    df = pd.DataFrame(data_list, columns=_PUBMED_CSV_COLUMNS)
    with _PUBMED_CSV_LOCK:
        df.to_csv('pubmed.csv', encoding='utf_8_sig', mode='a', index=False, header=False)

def to_excel(data, path):
    """Write ``data`` to an Excel workbook at ``path`` as one sheet named 'table'.

    :param data: object exposing ``to_excel`` (a pandas DataFrame).
    :param path: destination workbook path.
    :return: None
    """
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)

def get_files_path():
    """Collect the XML files that still have to be parsed.

    Walks the three drug folders for ``*.xml`` files. When ``pubmed.csv``
    exists, every path listed in its ``file_path`` column is removed from the
    result; otherwise the CSV is created containing only the header row.

    :return: a ``set`` of pending paths when ``pubmed.csv`` already existed,
        otherwise the ``list`` of all discovered paths.
    """
    # Accumulate across every directory level; os.walk yields nothing for a
    # folder that does not exist, so the list is always defined.
    file_list = []
    for drug_dir in ('first in class drug', 'follow on drug', 'me too drug'):
        for base_path, _folders, files in os.walk(drug_dir):
            file_list.extend(
                os.path.join(base_path, name) for name in files if name.endswith('.xml'))

    if os.path.exists('pubmed.csv'):
        # utf_8_sig strips the BOM that the header was written with.
        df = pd.read_csv('pubmed.csv', encoding='utf_8_sig')
        parsed_files = set(df.file_path)
        print('共需解析文件：{0}'.format(len(file_list)))
        file_list = set(file_list) - parsed_files
        print('已解析文件：{0}'.format(len(parsed_files)))
    else:
        df = pd.DataFrame(columns=['PMID', 'affiliations_info', 'article_type', 'country',
                                   'file_path', 'journal', 'mesh_words', 'pub_year'])
        df.to_csv('pubmed.csv', encoding='utf_8_sig', index=False)
        print('共需解析文件：{0}'.format(len(file_list)))
        print('已解析文件：0')
    return file_list

if __name__ == '__main__':
    # Parse every pending XML file on a thread pool, one task per file.
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        # The with-block shuts the executor down once all tasks have finished.
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xpath_parse, file_path): file_path
                       for file_path in files_list}
            # Collect each result so a failure in a worker is reported instead
            # of being silently dropped; one bad file does not stop the rest.
            for future, file_path in futures.items():
                try:
                    future.result()
                except Exception as exc:
                    print('{0}\t解析失败：{1!r}'.format(file_path, exc))

秒客网

Python解析xml文档实战案例

xml文档

方法一：xml.etree.cElementTree

方法二：lxml+xpath

相关文章