Warning: Use of undefined constant AMP_QUERY_VAR - assumed 'AMP_QUERY_VAR' (this will throw an Error in a future version of PHP) in /customers/d/8/e/bigdata-doctor.com/httpd.www/wp-content/plugins/amp/includes/amp-helper-functions.php on line 42 New Year’s resolutions post – R Code | Big Data Doctor

New Year’s resolutions post – R Code

Download PDF

This is the code associated with the post Understanding New Year’s Resolutions with Google Trends in R… You may want to have a look at the post first 🙂

library(scales)
library(googletrend) 
library(ggplot2)
 
# You need to Login http://google.com/trends on your browser!! 
 
DE<-c("gute vorsätze","rauchen+aufhoeren","abnehmen","urlaub","parship+elitepartner","fitness","zinsen+sparen","gesund")
GB<-c("good resolutions","quit smoking","diet","travel","dating","gym","save+money","healthy")
US<-c("good resolutions","quit smoking","diet","travel","dating","gym","save+money","healthy")
ES<-c("buenos+propĂłsitos","fumar","dieta","viaje","pareja","gimnasio","gastar+cuesta enero","sano")
FR<-c("bonnes resolutions","arreter de fumer","regime","vacances","meetic+eDarling","gym","économique","saine+équilibrée")
IT<-c("buoni propositi","smettere fumare","dieta","ferie","meetic+eDarling", "palestra","soldi","sano")
 
# Data gathering
goodResolutions<-NULL
for (i in 1:length(DE))
{
  df<-getTrendResolution(kw = DE[i],"DE","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)
  df<-getTrendResolution(kw = GB[i],"GB","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)
  df<-getTrendResolution(kw = ES[i],"ES","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)
  df<-getTrendResolution(kw = FR[i],"FR","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)
  df<-getTrendResolution(kw = IT[i],"IT","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)  
  df<-getTrendResolution(kw = US[i],"US","4/2012+33m",i)
  goodResolutions<-rbind(df,goodResolutions)  
}
 
 
## Visualizing the trend chart with ggplot2
# Just fix the number of the group
data<-subset(goodResolutions, group==2)
data$nweek<-strptime(data$week, '%Y-%m-%d')
 
ggplot(data, aes(x=nweek,y=index, group=country,color=country)) +
  geom_vline(xintercept=as.numeric(ISOdate(2014,1,5)), linetype=4, color='darkgrey') +
  geom_vline(xintercept=as.numeric(ISOdate(2013,1,5)), linetype=4, color='darkgrey') +
  geom_line() +
  facet_wrap(~country,ncol = 3)+
  ggtitle(paste(unique(data[data$country=='GB',]$tag),collapse = '/')) +
  ylab('') + xlab('') +
  theme(strip.text.x = element_text(size = 12, colour = "black")) +
  theme(axis.text=element_text(size=12)) +
  theme(axis.title=element_text(size=14,face="bold")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(panel.background = element_rect(fill = 'white')) +
  theme(panel.grid.major = element_line( color="snow2")) +
  theme(legend.position = "none") +
  scale_x_datetime(breaks="3 month", labels = date_format("%Y-%m")) 
 
 
# Function to generate the pairwise correlation between the trends of the different countries
# Note: it can be done more efficiently, I'm aware of that... :)
corCountries <- function (data, countries)
{
 
  corValues<- NULL
  for (i in 1:length(countries))
  {
    cor.a<- subset(data, country==countries[i])
    for (j in 1:length(countries))
    {
      cor.b <- subset(data, country==countries[j])
 
      cor.ab <- round(cor(x = cor.a$index, y = cor.b$index,use = "all.obs"),digits = 2)
      if (countries[i]==countries[j])
      {
        cor.ab<-0
      }
 
      corValues<-rbind(corValues,c(countries[i],countries[j],cor.ab))
    }
  }
  return(corValues)
}
 
countries<-c("US","GB","ES","FR","IT","DE")
corValues<-corCountries(data,countries)
corValues<-as.data.frame(corValues,row.names = NULL)
colnames(corValues)<-c("country_A","country_B","corr")
corValues$corr<-as.numeric(as.character(corValues$corr))
 
# Visualization of the correlation matrix
ggplot(corValues,aes(x=country_A, y=country_B)) +
  geom_point(aes(alpha=corr,size=(log(corr,base = 10)), color=country_B)) +
  geom_text(aes(label=corr),  size=4, vjust=2, hjust=0.5) +  
  ylab('') + xlab('') +
  ggtitle(paste(unique(data[data$country=='GB',]$tag),collapse = '/')) +
  theme(strip.text.x = element_text(size = 12, colour = "black")) +
  theme(axis.text=element_text(size=12)) +
  theme(axis.title=element_text(size=12,face="bold")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(panel.background = element_rect(fill = 'white')) +
  theme(panel.grid.major = element_line( color="snow2")) +
  theme(axis.text = element_text(face = "bold", color = "black", size = 12)) +
  theme(legend.position="none") +
  annotate("segment", x=0, y=0, xend=6, yend=6, size=1.5,colour="lightgray")
 
################################################################################################
# My modification to gettrend function in the googletrend package 
gettrend2<-function(keyword="boston", geo=NULL, year=NULL, 
                    category=NULL, plot=TRUE,simple=TRUE, 
                    use.monthly=FALSE, compare=FALSE) 
{
 
  require(utils)
  # mod; added compare functionalities 
  if(compare) return( .gettrend.compare(keyword=keyword,
                                        geo=geo,
                                        year=year,
                                        category=category,
                                        plot=plot,
                                        simple=simple,
                                        compare=TRUE) ) 
 
  # set download directory path 
  # mod: 20-01-2014 fix for download directory path error 
  setup.download.dir <- function()
  {
 
    if( ! file.exists(.googletrend$DOWNLOADDIR) ) 
    {
      text<-sprintf(' |- error : your default browser download path [%s] was not found.', 
                    .googletrend$DOWNLOADDIR) 
      message(text)
 
      message (' |- type googletrend::setdownloaddir("your browser download path") and try again! :) ')
 
      return(NULL)
    }
 
    # cleaning up old downloaded good trend data files  
    for ( item in dir(.googletrend$DOWNLOADDIR, pattern='^report', full.names=TRUE) )
    {
      file.remove(item) # delete old trend data file 
    }
 
    return(.googletrend$DOWNLOADDIR)
  }
 
  # 
  # handling multiple keywords with comman,
  keyword=gsub(' ', "%20", keyword) # handling space 
  keyword=gsub('"', "%22", keyword) # handling double quote 
  keyword=gsub('\\+', "%2B", keyword) # operator plus 
 
 
  KEYS <- unlist( strsplit(keyword, ','))
 
  if( length(KEYS) > 1 ) 
  {
 
    L <- list()
    for(item in KEYS)
    {
      item=gsub('%20',' ', item)
      command<-sprintf('L$`%s` <- gettrend(keyword=item,geo=geo, year=year, category=category,plot=plot, use.monthly=use.monthly, simple=simple )', item)
 
      eval(parse(text=command))
 
    }
 
    message(' Note: returning R list object contains multiple keywords!')
    message(' TIP')
    message(' LIST.RESULT <-gettrend("boston,new york")')
    message(' JOINED <- googletrend::mergetrend(LIST.RESULT) # to joint them together')
 
    return(L)
  }
 
  # setup download directory 
  DOWNLOADDIR<-setup.download.dir() 
  if( is.null(DOWNLOADDIR)) return(NULL)
 
  REPORTFILES=dir(DOWNLOADDIR, pattern='^report')  
  if(length(REPORTFILES)>0) # filtering suffix extention
    REPORTFILES=REPORTFILES[ grep('.csv$', REPORTFILES) ]
 
  # handing report id number 
  if(length(REPORTFILES) == 1) # only 1st download file 
  {
    if (REPORTFILES == "report.csv") # first one 
      NEXT.REPORT.ID <- 1 
  }
 
  # never download case 
  if( length(REPORTFILES) == 0 )
    NEXT.REPORT.ID <- 0 
 
  # has been downloaded more than once 
  if( length(REPORTFILES) > 1 ) # normal case incremental report number 
  {    
    # finding next report number
    X<-gsub('report', '', REPORTFILES ) 
    X<-gsub('.csv', '', X ) 
    X<-gsub('\\(', '', X ) 
    X<-gsub('\\)', '', X ) 
    NEXT.REPORT.ID<-max( as.numeric(X), na.rm=T ) + 1    
  } 
 
 
  # handling path   
  if( NEXT.REPORT.ID == 0)
    REPORT.PATH<-paste(DOWNLOADDIR, "report.csv", sep='/') else
    {
      REPORT.PATH<-paste(DOWNLOADDIR, 'report', sep='/' )
      REPORT.PATH<-sprintf('%s(%d).csv', REPORT.PATH, NEXT.REPORT.ID)
      message(paste('download csv file path:', REPORT.PATH))
      REPORT.PATH <<- REPORT.PATH 
    }
 
  # CONSTRUCT GOOGLE TREND QUERY   
  if(!is.null(category) ) 
  {
    message( sprintf("|- ** GOOGLE CATEGORY: %s ** -|", category )) 
    trendsURL <- sprintf('http://www.google.com/trends/trendsReport?cat=%s&q=%s&content=1&export=1', category, keyword)    
  }
  else trendsURL <- sprintf('http://www.google.com/trends/trendsReport?q=%s&content=1&export=1', keyword)
 
  # handling customizing query  
  if( !is.null(geo))
    trendsURL <- sprintf('%s&geo=%s', trendsURL, geo)
 
  if( !is.null(year))
    #    trendsURL <- paste(trendsURL, '&date=1%2F', year,'%2012m', sep='')
    trendsURL <- paste(trendsURL, '&date=', year, sep='')
 
  utils::browseURL(trendsURL)
  message(trendsURL)
 
  retry=0
  while ( !file.exists(REPORT.PATH) ) 
  {
    Sys.sleep(1)
    retry<-retry+1 
    if(retry > 25) 
    {
      message(' |- Something went wrong!')
      message(' |- Did you login to your gmail account at http://www.google.com/trends?')
      message(' |- or Maybe, your browser default download directory path is different!') 
      message(' |- type googletrend::setdownloaddir("YOUR BROWSER DOWNLOAD DIRECTORY PATH")')
      stop(" |- timeout ")
    }
 
  } 
 
  # All succeed case 
{
  # Parse resonse and store in CSV
  # We skip ther first 5 rows which contain the Google header; we then read 503 rows up to the current date
  x<-datareader(file=REPORT.PATH)   
 
  #mod: chriso monthly aggregate 
  if(use.monthly)
  {   
    x$trend$week=as.Date( sprintf("%s-%s-01", as.numeric( format(x$trend$week,'%Y') ), as.numeric( format(x$trend$week,'%m') ) ) )
    x$trend=aggregate(index ~ week, data=x$trend, FUN=sum)
    message('use.monthly [ON] - ..we are using monthly Index')
  }
 
  if( !is.null(x) & simple ) 
    return(x$trend) else 
      return(x)    
}
 
} # f( gettrend )