2015-09-22 8 views
1

Это то, что я могу сделать легко в Excel. Но Im запутано R.R: сопоставление каждой строки в списке с другим списком при возврате смежного значения

Я хотел бы присвоить названия стран длинному списку строк («принадлежность»).

c("Department of Psychiatry and Behavioural Sciences, University College London Medical School, UK.", 
"", "Ty Dewi Sant School of Nursing, University Hospital of Wales, College of Medicine, Cardiff.", 
"University of Massachusetts Medical Center.", "Older Women's League.", 
"Kimberly Quality Care, Boston, MA.", "Michaux Manor Living Center, Fayetteville, PA.", 
"Florida Diagnostic and Learning Resources System, University of South Florida, Tampa 33613.", 
"", "Bigel Institute for Health Policy, Brandeis University, Waltham, MA.", 
"", "York Health Authority.", "Southern Illinois University, Edwardsville.", 
"St. Joseph's Hospital, Memphis, TN.", "Long Term Home Care of the Frail Elderly Foundation, New York City.", 
"Catholic University of America, Washington, DC.", "Mercy Health Center, Oklahoma City, OK.", 
"", "Visiting Nurse Service of New York.", "RespiteCare Center, Evanston, IL.", 
"Camden and Islington HA.", "National Advisory Council on Aging.", 
"Visiting Nurse Service of New York.", "American Health Care Association, Washington, DC.", 
"HealthCare Partners Medical Group, Los Angeles, CA 90015, USA.", 
"Tad Publishing Company, Peoria, IL, USA.", "Child Health Investment Partnership, Roanoke, VA, USA.", 
"School of Public Health, State University of New York, Albany 12237, USA.", 
"Bundoora Extended Care Centre.", "", "", "Family Respite Center, Falls Church, VA, USA.", 
"", "University of Victoria.", "", "Homemaker Health Aide Service of the National Capital Area.", 
"West Lambeth Health Authority, London SE1 7EH.", "Bon Secours Hospital/Villa Maria Nursing Center, North Miami, FL 33161.", 
"Alzheimer's Disease and Related Disorders Association, Syracuse, NY.", 
"Alzheimer's Association, Washington DC.", "South Carolina Commission on Aging, Columbia.", 
"University of New Mexico College of Nursing.", "Department of Human Development and Family Studies, University of Alabama, Tuscaloosa.", 
"Ballard Health Care Residence, Des Plaines, IL.", "Bowman Gray School of Medicine of Wake Forest University, Winston-Salem, NC.", 
"Case Western Reserve University.", "School of Public and Environmental Administration, Indiana University, Indianapolis 46202.", 
"Manor HealthCare Corp, Silver Spring, MD.", "Relationship Builders, Napa, CA.", 
"", "", "Medical University of South Carolina, USA.", "Tokyo Metropolitan Institute of Gerontology, Itabashi, Japan. [email protected]", 
"Medical University of South Carolina, USA.", "Royal Hospital for Sick Children, Bristol.", 
"Barefield, Ennis, Co. Clare., Ireland.", "North Georgia College, Dahlonega 30597, USA.", 
"Institute for Psychology (I), University of Wurzburg, Germany.", 
"Camborne Redruth Community Hospital, Cornwall, United Kingdom.", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Institute of Child Health and Great Ormond Street Hospital for Children NHS Trust, London, UK.", 
"Department of Psychiatry, University of Toronto, Toronto, Ontario, Canada. [email protected]", 
"Boston University School of Social Work, MA 02215, USA.", "", 
"Neurosciences Unit, General Infirmary at Leeds.", "", "", "School of Kang-Ning Junior College of Nursing, Nei-Hu, Taiwan, ROC.", 
"College of Nursing, South Dakota State University, USA.", "Department of Geriatric Medicine, University of Manchester, UK.", 
"Southern Illinois University, Department of Social Work, Edwardsville 62026-1450, USA.", 
"Redlands Community College, El Reno, Oklahoma, USA.", "", "", 
"Department of Geriatric Medicine, Alexandra Hospital, Singapore.", 
"School of Nursing and Midwifery, Department of Gerontological and Continuing Care Nursing, University of Sheffield, Sheffield, England. [email protected]", 
"", "State University of New York, Health Science Center at Syracuse,, USA. [email protected]", 
"Div. of Active Palliative Care, Todachuo General Hospital.", 
"Children and Young People's Kidney Unit, Nottingham City Hospital, NHS Trust, UK.", 
"School of Nursing & Midwifery, Department of Gerontological & Continuing Care Nursing, University of Sheffield. [email protected]", 
"Harrington Memorial Hospital, Southbridge, MA, USA.", "", "Department of Curriculum and Instruction, Iowa State University, Ames, 50011. USA.", 
"Children & Young People's Kidney Unit, Nottingham City Hospital, U.K.", 
"School of Social Work, Boston University, MA 02215, USA. [email protected]", 
"Royal Free Hospital, London, UK.", "Humboldt State University, Department of Nursing, Arcata, CA, USA.", 
"Department of Psychiatry, The University of Queensland, Mental Health Centre, Royal Brisbane Hospital, Herston, Australia. [email protected]", 
"Centre for Evidence Based Nursing, University of York, Heslington, York, Nth Yorkshire, UK, YO1 5DG. [email protected]", 
"School of Nursing, University of British Columbia, Vancouver. [email protected]", 
"Medisinsk avdeling, Lovisenberg Diakonale Sykehus, Oslo.", "School of Nursing, Yale University, USA.", 
"Centre de la Mémoire, Hôpital Roger Salengro, Centre Hospitalier Universitaire, Lille.", 
"University of Ulster and Eastern Health and Social Services Board, Ulster, Northern Ireland. [email protected]", 
"Thames Valley Family Practice Research Unit, Department of Family Medicine's Centre for Studies in Family Medicine, University of Western Ontario (UWO), London. [email protected]", 
"", "", "Department of Special Education, University of Nijmegen, The Netherlands. [email protected]", 
"European Institute of Health and Medical Sciences, University of Surrey, Guildford, England.", 
"California State University School of Nursing, Chico, USA.") 

Внутри каждой строки может быть или не быть подстрока, относящаяся к местоположению, которое само по себе может относиться к стране. Предполагаемый выход является dataframe следующим образом:

Affiliation[1], matchedCountry 
Affiliation[2], matchedCountry 
... 
Affiliation[n], matchedCountry 

«matchedCountry» означает оцениваться на основе нескольких списков (университетов, городов Великобритании, США говорится, и т.д.) и NA допускается. И некоторые списки только возвращают коды ISO.

Основываясь на обратной связи до сих пор (спасибо @rbm), мне удалось найти решение (см. Раздел ответа), который хорошо выполняет работу. Тем не менее, я уверен, что производительность еще может быть улучшена. Благодарю.

Список литературы:

  1. Simultaneously merge multiple data.frames in a list
  2. R grepl: quickly match multiple strings against multiple substrings, returning all matches
  3. R grep: Match one string against multiple patterns
  4. Speedy test on R data frame to see if row values in one column are inside another column in the data frame
  5. Extract & combine multiple substrings using multiple patterns from some but not all strings contained in list & return to list in R
  6. How to detect substrings from multiple lists within a string in R
+1

Каков ваш желаемый результат? Уникальный список всех стран, которые происходят? Или 'data.frame' с вашим _affiliationList_ в качестве столбца и страны в качестве второго столбца? – RHA

+3

Рассмотрите обзор руководящих принципов для [mcve]. Я думаю, что вам не хватает «минимальной» части в этом вопросе. – Frank

+0

Это может помочь: http://stackoverflow.com/questions/32714927/how-to-detect-substrings-from-multiple-lists-within-a-string-in-r – rbm

ответ

0

Вот решение, которое проверяет различные списки подстрок по каждому элементу в главном списке, а затем в зависимости от списка возвращает либо: a) исходную подстроку, b) смежную подстроку, либо c) фиксированную/предварительную -определенное значение. Результатом является исходная таблица со столбцом «страна».

Эти условия представлены в примере кода.

Редактировать: похоже, поиск домена не работает должным образом. Я не совсем уверен, как устранить/исправить, но это выходит за рамки этого вопроса, я думаю ...

######### GENERATE COUNTRY ID ############# 

    library("stringr") 
    library(RCurl) 

    ## Download country lists and perpetrate 

    countryList <- getURL("https://raw.githubusercontent.com/umpirsky/country-list/master/country/icu/en_US/country.csv") 
    usstatesList <- getURL("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv") 
    ukcitiesList <- getURL("https://raw.githubusercontent.com/encyclopediaio/list-of-cities-in-the-uk/master/src/uk_cities.csv") 
    ukcountryList <- getURL("https://raw.githubusercontent.com/Gibbs/UK-Postcodes/master/postcodes.csv") 
    universitiesList <- getURL("https://raw.githubusercontent.com/endSly/world-universities-csv/master/world-universities.csv") 

    countryList <- read.csv(text = countryList, stringsAsFactors=FALSE) 
    usstatesList <- read.csv(text = usstatesList, stringsAsFactors=FALSE) 
    ukcitiesList <- read.csv(text = ukcitiesList, stringsAsFactors=FALSE) 
    ukcountryList <- read.csv(text = ukcountryList, stringsAsFactors=FALSE) 
    universitiesList <- read.csv(text = universitiesList, header = FALSE, stringsAsFactors=FALSE) 

    ## Generate affiliation list from ronbun data 
    affiliationList <- pub.data$Affiliation1 

    ## Generate email domains column and add to countryList 
    domains <- function(x) 
    { 
    x <- tolower(x) 
    x <- paste0(".", x) 
    return(x) 
    } 

    countryList <- data.frame(countryList[c("name", "iso")], domain = domains(countryList$iso), stringsAsFactors = FALSE) 


    ## Add country names to universitiesList as V4 

    universitiesList <- data.frame(universitiesList, V4="", stringsAsFactors = FALSE) 

    i = 0 
    for (v in universitiesList$V1) 
    { 
    tryCatch({ 
    i = i + 1 
     if (sum(str_detect(v, countryList$iso)) > 0) { 
     universitiesList$V4[i] <- countryList$name[which(str_detect(v, countryList$iso))] 
     } 
    }, error=function(e){}) 
    } 

    ### on to the main show 

    df <- data.frame(affiliationList, CountryISO="", CountryNAME="", stringsAsFactors = FALSE) 


    i = 0 
    for (v in affiliationList) 
    { 
    tryCatch({ 
    i = i + 1 
     if (sum(str_detect(v, countryList$name)) > 0) { 
     df$CountryISO[i] <- countryList$iso[which(str_detect(v, countryList$name))] 
     df$CountryNAME[i] <- countryList$name[which(str_detect(v, countryList$name))] 
     } 
     if (sum(str_detect(v, ukcitiesList$name)) > 0) { 
     df$CountryISO[i] <- "GB" 
     df$CountryNAME[i] <- "United Kingdom" 
     } 
     if (sum(str_detect(v, ukcountryList$country_string)) > 0) { 
     df$CountryISO[i] <- "GB" 
     df$CountryNAME[i] <- "United Kingdom" 
     } 
     if (sum(str_detect(v, usstatesList$State)) > 0 || sum(str_detect(v, usstatesList$Abbreviation)) > 0) { 
     df$CountryISO[i] <- "US" 
     df$CountryNAME[i] <- "United States" 
     } 
     if (sum(str_detect(v, countryList$domain)) > 0) { 
     df$CountryISO[i] <- countryList$iso[which(str_detect(v, countryList$domain))] 
     df$CountryNAME[i] <- countryList$name[which(str_detect(v, countryList$domain))] 
     } 
     if (sum(str_detect(v, universitiesList$V2)) > 0) { 
     df$CountryISO[i] <- universitiesList$V1[which(str_detect(v, universitiesList$V2))] 
     df$CountryNAME[i] <- universitiesList$V1[which(str_detect(v, universitiesList$V4))] 
     } 
    }, error=function(e){}) 
    } 

return(df) 

Спасибо за помощь при условии!