#!/usr/bin/env ruby
require 'rubygems'
require 'open-uri'
require 'json'
require 'hpricot'
# Fetch one page of a user's timeline (as Twitter's JSON fragments),
# print every "span.entry-content/a.web" link found in each tweet <li>
# (as "<status id> = <href>"), and return the relative URL of the next
# (older) page, or nil when there is no further page.
#
# url - String, full URL of the JSON-format timeline page to fetch.
#
# Returns String href of the next page, or nil on the last page.
def get_page(url)
  body = open(url).read
  data = JSON.parse(body)

  # '#timeline' holds an HTML fragment of tweet <li> elements.
  doc = Hpricot(data['#timeline'])
  doc.search('li').each do |item|
    status = item.attributes['id']
    item.search("span.entry-content/a.web").each do |c|
      puts "#{status} = #{c.attributes['href']}"
    end
  end

  # '#pagination' holds the "older" link; on the final page there is no
  # <a> at all, so guard against nil instead of crashing with
  # NoMethodError — returning nil ends the caller's while loop cleanly.
  pagination = Hpricot(data['#pagination'])
  older = pagination.search('a').first
  older && older.attributes['href']
end
# Walk every page of the user's timeline, starting at page 1 and
# following the pagination link returned by get_page until it runs out.
user = 'twitter-user'
page_path = get_page("http://twitter.com/#{user}?page=1&format=json")

until page_path.nil?
  # Throttle requests so we don't get banned
  sleep 1
  page_path = get_page("http://twitter.com#{page_path}")
end
Wednesday 14 October 2009
Scraping a user's Twitter
I needed to get all the urls from a user's tweets, this is what I came up with.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment