Wednesday 14 October 2009

Scraping a user's Twitter

I needed to get all the URLs from a user's tweets; this is what I came up with.

#!/usr/bin/env ruby

require 'rubygems'

require 'open-uri'
require 'json'
require 'hpricot'

# Fetch one page of a user's Twitter timeline (the old ?format=json
# endpoint, which returns HTML fragments inside a JSON envelope),
# print "<status id> = <href>" for every outbound link in each tweet,
# and return the relative URL of the next page — or nil on the last
# page, which is what ends the caller's while loop.
def get_page(url)
  # URI.parse(url).open goes through open-uri explicitly; bare
  # Kernel#open(url) would execute a subprocess if url ever began
  # with "|" (command-injection footgun).
  body = URI.parse(url).open.read
  page = JSON.parse(body)

  doc = Hpricot(page['#timeline'])
  doc.search('li').each do |item|
    status = item.attributes['id']
    item.search("span.entry-content/a.web").each do |link|
      puts "#{status} = #{link.attributes['href']}"
    end
  end

  # The last page has no pagination anchor; the original called
  # .attributes on nil and crashed instead of terminating cleanly.
  pagination = Hpricot(page['#pagination'])
  more = pagination.search('a').first
  more && more.attributes['href']
end

# Walk every page of the user's timeline, following the pagination
# link returned by get_page until there is no next page.
user = 'twitter-user'

url = "http://twitter.com/#{user}?page=1&format=json"
loop do
  page_path = get_page(url)
  break unless page_path
  # So we don't get banned
  sleep 1
  url = "http://twitter.com#{page_path}"
end

No comments:

Post a Comment