On GitHub astraldragon / scraping-presentation
# Scrape a product page and print its title and price as JSON.
# NOTE(review): assumes httparty, nokogiri and json are already required.

# Load a page
response = HTTParty.get('http://www.amazon.ca/cards-against-humanity-CAHUS-Humanity/dp/B004S8F7QM')
# Parse the raw response body rather than the HTTParty response wrapper.
book_page = Nokogiri::HTML(response.body)

# Find and extract something interesting
# (at_css returns nil when a selector matches nothing, so these lines raise
# NoMethodError if the page markup changes.)
book_title = book_page.at_css('#btAsinTitle span').text.strip
# Strip letters and '$' from the price text, then surrounding whitespace.
book_price = book_page.at_css('#actualPriceValue .priceLarge').text.gsub(/[a-z\$]/i, '').strip

# Make it pretty
book = { title: book_title, price: book_price }
puts book.to_json
# > { "title": "Cards Against Humanity", "price": "30.00" }
# Scrape the events listed on the page and print them as a JSON array.
# NOTE(review): assumes httparty, nokogiri and json are already required.
response = HTTParty.get('http://www.sjdevug.com')
# Parse the raw response body rather than the HTTParty response wrapper.
events_page = Nokogiri::HTML(response.body)

# Build one hash per element matching 'section .event'.
# at_css returns nil when a child selector matches nothing, in which case
# the .text call raises NoMethodError.
events = events_page.css('section .event').map do |event_element|
  {
    name: event_element.at_css('h4').text,
    date: event_element.at_css('.date').text,
    description: event_element.at_css('.event-description').text
  }
end

puts events.to_json
[
  {
    "name": "Web Scraping for the Greater Good!",
    "date": "January 30, 2014",
    "description": "Want to turn a website into a structured API?..."
  },
  {
    "name": "No Event Scheduled",
    "date": "",
    "description": ""
  },
  {
    "name": "No Event Scheduled",
    "date": "",
    "description": ""
  }
]
<body>
  <a href="http://www.sjdevug.com"></a>
  <a href="http://www.sjdevug.com/badurl"></a>
  <a href="http://www.sjdevug.com/badurl2"></a>
</body>
# Report every link in sample.html whose target responds with HTTP 404.
# NOTE(review): assumes httparty and nokogiri are already required.
html = File.read('sample.html')
links_page = Nokogiri::HTML(html)

# Collect each anchor's href, then keep only those whose GET answers 404.
# Responses with any other status code are not reported.
broken_links = links_page.css('a')
                         .map { |link| link[:href] }
                         .select { |href| HTTParty.get(href).code == 404 }

puts broken_links
# Result, shown as an array (puts itself writes one URL per line):
# > ["http://www.sjdevug.com/badurl", "http://www.sjdevug.com/badurl2"]