On GitHub: astraldragon / scraping-presentation
# Load a page
# HTTParty.get returns a response object; its body is handed to Nokogiri
# explicitly so the parser receives the page markup as a plain string.
response = HTTParty.get('http://www.amazon.ca/cards-against-humanity-CAHUS-Humanity/dp/B004S8F7QM')
book_page = Nokogiri::HTML(response.body)

# Find and extract something interesting
# NOTE(review): both lookups call .text directly on the at_css result, so
# they presumably raise NoMethodError if the page layout changes and a
# selector stops matching — confirm before reusing outside a demo.
book_title = book_page.at_css('#btAsinTitle span').text.strip
# Drop every letter and the dollar sign from the price text, leaving the amount.
book_price = book_page.at_css('#actualPriceValue .priceLarge').text.gsub(/[a-z\$]/i, '').strip

# Make it pretty
book = { title: book_title, price: book_price }
# Print the JSON (rather than discarding it) so the script shows its result,
# the same way the other snippets in this file do.
puts book.to_json
> { "title": "Cards Against Humanity", "price": "30.00" }
# Fetch the events page and parse the response body into a Nokogiri document.
response = HTTParty.get('http://www.sjdevug.com')
events_page = Nokogiri::HTML(response.body)

# Build one hash per element matching 'section .event'; each field is the
# text of the first matching child node inside that element.
events = events_page.css('section .event').map do |event_element|
  {
    name: event_element.at_css('h4').text,
    date: event_element.at_css('.date').text,
    description: event_element.at_css('.event-description').text
  }
end

# Emit the collected events as JSON.
puts events.to_json
[{
"name": "Web Scraping for the Greater Good!",
"date": "January 30, 2014",
"description": "Want to turn a website into a structured API?..."
},
{
"name": "No Event Scheduled",
"date": "",
"description": ""
},
{
"name": "No Event Scheduled",
"date": "",
"description": ""
}]
<body>
<a href="http://www.sjdevug.com"></a>
<a href="http://www.sjdevug.com/badurl"></a>
<a href="http://www.sjdevug.com/badurl2"></a>
</body>
# Read the sample page from disk and parse it.
html = File.read('sample.html')
links_page = Nokogiri::HTML(html)

# Collect the href of every anchor whose GET request answers with 404.
# The 'a[href]' selector skips anchors that have no href attribute, so a
# nil URL is never passed to HTTParty.get.
# NOTE(review): only HTTP 404 counts as broken here; other error statuses
# and connection failures are not handled — extend if that matters.
link_urls = links_page.css('a[href]').map { |link| link[:href] }
broken_links = link_urls.select { |href| HTTParty.get(href).code == 404 }

# p prints the array in its inspect form (["...", "..."]); puts would print
# one element per line instead.
p broken_links
> ["http://www.sjdevug.com/badurl", "http://www.sjdevug.com/badurl2"]