module(... or 'crawl', package.seeall)

local httputils = require('httputils')

-- curl command templates; %s receives the URL to fetch.
local CURL_DIRECT = 'curl -s -H "Accept: text/html" --max-filesize 100000 "%s"'
local CURL_PROXY = 'curl -s -H "Accept: text/html" -x http://fw.net.local:8081 --proxy-user proxyuser --max-filesize 100000 "%s"'

-- Patterns for quoted attribute/function values such as ="..." or ('...').
-- ABS_URL captures values containing "://"; REL_URL captures values that
-- start (after optional whitespace) with "/".
local ABS_URL = "[=(]['\"]([^'\"]-://[^'\"]-)['\"]"
local REL_URL = "[=(]['\"]([%s]-/[^'\"]-)['\"]"

-- URLs longer than this are never queued.
local MAX_URL_LENGTH = 255

--- Tell whether a URL can be placed inside the double-quoted curl argument.
-- URLs are scraped from fetched pages (untrusted input) and end up in a
-- command line passed to io.popen. A double quote, backslash, `$`, backtick
-- or control character could terminate the quoted argument or trigger shell
-- substitution, so such URLs are rejected.
-- @param url string
-- @return boolean
local function is_shell_safe(url)
  return string.find(url, '[%c"`$\\]') == nil
end

--- Run a command and return everything it wrote to stdout.
-- @param cmd string command line handed to io.popen
-- @return string captured output; "" when the process could not be started
local function fetch(cmd)
  local pipe = io.popen(cmd, 'r')
  if not pipe then
    return ""
  end
  local body = pipe:read("*all")
  pipe:close()
  return body or ""
end

--- Crawl a host breadth-first, printing every URL that is fetched.
-- Starting at startURL, each page is downloaded with curl and scanned for
-- quoted absolute and relative URLs. Absolute URLs are followed only when
-- they contain `host`; relative URLs are prefixed with "http://" .. host.
-- Each URL is fetched at most once.
-- @param host      string host (optionally with port), e.g. "localhost:1470"
-- @param startURL  string first URL to fetch
-- @param useProxy  when truthy, curl is run through the configured proxy
function scanHost(host, startURL, useProxy)
  local curl = useProxy and CURL_PROXY or CURL_DIRECT
  local abshost = "http://" .. host

  -- FIFO queue kept as an array with head/tail counters so that taking the
  -- next URL does not shift the whole array.
  local queue = {}
  local head, tail = 1, 0
  local seen = {}

  -- Queue a URL unless it was already queued, is too long, or cannot be
  -- passed safely on the curl command line.
  local function addUrl(url)
    if seen[url] == nil and #url < MAX_URL_LENGTH and is_shell_safe(url) then
      seen[url] = true
      tail = tail + 1
      queue[tail] = url
    end
  end

  addUrl(startURL)

  while head <= tail do
    local url = queue[head]
    head = head + 1

    print(url)
    local content = fetch(string.format(curl, url))

    for word in string.gmatch(content, ABS_URL) do
      local u = httputils.HtmlDecode(word)
      -- Plain find: the host is literal text, not a Lua pattern
      -- ("." and "-" in host names would otherwise be magic).
      if string.find(u, host, 1, true) then
        addUrl(u)
      end
    end

    for word in string.gmatch(content, REL_URL) do
      local u = httputils.HtmlDecode(word)
      -- REL_URL allows whitespace before the leading "/"; drop it so it
      -- does not end up between the host and the path.
      u = string.gsub(u, "^%s+", "")
      -- Values starting with "/wEP" are skipped (presumably ASP.NET
      -- view-state blobs rather than paths -- confirm); values containing
      -- "//" are not treated as host-relative paths.
      if string.sub(u, 1, 4) ~= "/wEP" and string.find(u, '//', 1, true) == nil then
        addUrl(abshost .. u)
      end
    end
  end
end

--scanHost("localhost:1467", "http://localhost:1467/", false);
scanHost("localhost:1470", "http://localhost:1470/nl-NL/Home.aspx", false);

-- The statements below declare a second module, 'httputils'
-- (it appears to originate from a separate file).
module(... or 'httputils', package.seeall)

-- Character -> entity name.
-- NOTE(review): an ASCII space is mapped to "nbsp" (and "&nbsp;" decodes back
-- to an ASCII space, not U+00A0). Kept as-is because callers may rely on it.
local htmlEncodeEntities = {
  ['&'] = 'amp',
  [' '] = 'nbsp',
  ['"'] = 'quot',
  ['<'] = 'lt',
  ['>'] = 'gt',
}

-- Entity name -> character, and character -> complete "&name;" replacement.
local htmlDecodeEntities = {}
local htmlEncodeReplacements = {}
for char, name in pairs(htmlEncodeEntities) do
  htmlDecodeEntities[name] = char
  htmlEncodeReplacements[char] = '&' .. name .. ';'
end

--- Replace the characters listed in htmlEncodeEntities by "&name;".
-- Returns the results of string.gsub unchanged: the encoded string followed
-- by the number of characters examined. Wrap the call in parentheses to
-- keep only the string.
-- @param text string
function HtmlEncode(text)
  -- Characters without a table entry are left as they are.
  return string.gsub(text, "(.)", htmlEncodeReplacements)
end

--- Replace the known "&name;" entities by their characters.
-- Unknown entities and stray "&" / ";" characters are left untouched.
-- Returns the results of string.gsub unchanged: the decoded string followed
-- by the number of entity-shaped matches. Wrap the call in parentheses to
-- keep only the string.
-- @param html string
function HtmlDecode(html)
  -- A missing table entry makes gsub keep the original match.
  return string.gsub(html, "&(%w+);", htmlDecodeEntities)
end

--- Placeholder: not implemented, takes no arguments and returns nothing.
function UrlEncode()
end

--- Placeholder: not implemented, takes no arguments and returns nothing.
function UrlDecode()
end
24300cookie-checkLua Crawl Host