require 'rubygems'
require 'uri'
require 'open-uri'
require 'net/http'
require 'date'
require 'json/pure'
require 'cgi'
require 'hpricot'

# Queries the Guardian content search API for book reviews.
#
# da - value sent as the API's `after` parameter; this script passes a
#      "%Y%m%d" date string.
#
# Returns the parsed JSON response body.
# Raises RuntimeError 'Not Found' on HTTP 404 and 'No Info' on HTTP 304.
def getGuardianData(da)
  api_key = "foo"
  count = 100
  u = "http://api.guardianapis.com/content/search?format=json&after=#{CGI.escape(da.to_s)}&filter=/books&filter=/global/reviews&count=#{count}&api_key=#{api_key}"
  url = URI.parse(u)
  puts "getting #{url} of #{da}"
  req = Net::HTTP::Get.new(url.request_uri)
  res = Net::HTTP.new(url.host, url.port).start { |http| http.request(req) }

  # Net::HTTP reports HTTP errors through the response object rather than by
  # raising, so the status has to be inspected explicitly.
  case res.code
  when '404' then raise 'Not Found'
  when '304' then raise 'No Info'
  end

  JSON.parse(res.body)
end

# Runs an Amazon ItemSearch (Books index) for a title and optional author.
#
# name   - title to search for (String).
# author - author to search for, or nil to search by title only.
#
# Returns the response body parsed with Hpricot.XML.
def getAMZData(name, author)
  id = "bar"
  amzURL = "http://webservices.amazon.co.uk/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=#{id}&Operation=ItemSearch&SearchIndex=Books"
  amzURL += "&Title=#{CGI.escape(name)}"
  amzURL += "&Author=#{CGI.escape(author)}" unless author.nil?
  url = URI.parse(amzURL)
  puts "getting #{url}"
  req = Net::HTTP::Get.new(url.request_uri)
  res = Net::HTTP.new(url.host, url.port).start { |http| http.request(req) }
  Hpricot.XML(res.body.to_s)
end

# Turns a review's link text into the Amazon lookups it implies.
#
# Link text containing "|" is treated as a round-up: everything after the
# first ": " is split on "|" and each piece is looked up by title only.
# Otherwise a "<prefix>: <title> by <author>" shape yields a single lookup.
#
# Returns [queries, multi]: queries maps title => author (nil when unknown),
# multi is true for the round-up form.
def amazon_queries(link_text)
  queries = {}

  if link_text =~ /\|/
    if link_text =~ /.*?: (.*)/
      Regexp.last_match(1).split("|").each { |title| queries[title] = nil }
    end
    return [queries, true]
  end

  # \b keeps "by" from matching inside a word such as "Babylon"; surrounding
  # whitespace still lands in the captures.
  if link_text =~ /.*?: (.*?)\bby\b(.*)/
    title = Regexp.last_match(1)
    author = Regexp.last_match(2)
    # do a little bit of hack processing to avoid the amz search failing
    author = author.gsub(/\sand\s.*/, "")
    author = author.gsub(/\swriting\sas\s.*/, "")
    author = author.gsub(/\sread\sby\s.*/, "")
    author = author.gsub(/,/, "")
    author = author.gsub(/\é/, "e")
    author = author.gsub(/\ó/, "o")
    puts "s #{title} a #{author}"
    queries[title] = author
  end

  [queries, false]
end

# Builds the report fragment for one Guardian search result, performing one
# Amazon lookup per title found in its link text.
#
# review - one element of the API's "results" array (Hash with string keys).
#
# Returns the fragment as a String; empty when the result has no link text.
def review_section(review)
  review_id = review["id"]
  link_text = review["linkText"]
  publication = review["publication"]
  byline = review["byline"]
  puts "#{review_id} #{link_text} #{publication}"

  return "" if link_text.nil? || link_text.empty?

  # remove dupe text: the trail text may repeat the link text verbatim
  trail_text = review["trailText"]
  trail_text = trail_text.gsub(link_text, "") if trail_text

  queries, multi = amazon_queries(link_text)

  review_type = ""
  (review["tags"] || []).each do |tag|
    review_type = "Paperback" if tag["filter"] == "/books/roundupreviews"
  end

  # NOTE(review): the link labels below are emitted as plain text with no
  # URLs or markup; confirm whether that is the intended output.
  section = "\n".dup
  queries.each do |title, author|
    doc = getAMZData(title, author)

    section << "\n\n#{title}"
    section << (author.nil? ? "\n" : "by #{author}\n\n\n")
    section << "\n#{review_type} Review by #{byline} - from #{publication}\n"
    section << "\n\n#{trail_text}\n\n\n" unless multi

    item = doc.search("//Item")[0]
    if item.nil?
      section << "Nothing found on Amazon\n"
    else
      puts((item / :ItemAttributes / :Author).inner_html)
      puts((item / :ItemAttributes / :Title).inner_html)
      section << "Amazon Link\n"
      section << "| Google search for ASIN\n"
      # wait a little between Amazon requests
      sleep(5)
    end
    section << "| Full review\n"
  end
  section << "\n\n\n"
end

# --- script body -----------------------------------------------------------

# make the right date for the guardian request
# The clock is read once so both dates derive from the same instant.
today = DateTime.now
d = today.strftime("%Y%m%d")
d1 = (today - 2).strftime("%Y%m%d")

results = getGuardianData(d1)["search"]["results"]

# text value for printing
txt = "\nGuardian and Observer Book Reviews for #{d1}-#{d} \n\n"
txt << "\n\nGuardian and Observer book reviews with Amazon lookup\n\n"
txt << "\n\nsee blog post for more information\n\n"

results.each { |review| txt << review_section(review) }

# Block form closes the file even if the write raises.
File.open("Reviews#{d}.html", "w") { |file| file.puts(txt) }