Commit
Fixed sitemap.xml parsing (Fixes #107)
s0md3v authored Jan 25, 2019
1 parent c019a04 commit c1fb0cf
Showing 1 changed file with 4 additions and 4 deletions.
photon.py: 8 changes (4 additions, 4 deletions)
@@ -341,7 +341,7 @@ def xmlParser(response):
     return findall(r'<loc>(.*?)</loc>', response)
 
 
-def zap(url):
+def zap(inputUrl):
     """Extract links from robots.txt and sitemap.xml."""
     if args.archive:
         from plugins.wayback import time_machine
@@ -356,7 +356,7 @@ def zap(url):
             verb('Internal page', url)
             internal.add(url)
     # Makes request to robots.txt
-    response = requests.get(url + '/robots.txt', verify=False).text
+    response = requests.get(inputUrl + '/robots.txt', verify=False).text
     # Making sure robots.txt isn't some fancy 404 page
     if '<body' not in response:
         # If you know it, you know it
@@ -376,15 +376,15 @@ def zap(url):
                     robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
     # Makes request to sitemap.xml
-    response = requests.get(url + '/sitemap.xml', verify=False).text
+    response = requests.get(inputUrl + '/sitemap.xml', verify=False).text
     # Making sure robots.txt isn't some fancy 404 page
     if '<body' not in response:
         matches = xmlParser(response)
         if matches: # if there are any matches
             print('%s URLs retrieved from sitemap.xml: %s' % (
                 good, len(matches)))
             for match in matches:
-                verb('Internal page', url)
+                verb('Internal page', match)
                 # Cleaning up the URL and adding it to the internal list for
                 # crawling
                 internal.add(match)
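Why this fixes the sitemap output: inside zap(), the name url is rebound by the surrounding code (the archive loop and the robots.txt handling both assign to url, as the context lines above show), so the old verb('Internal page', url) logged whatever url last happened to hold instead of each sitemap entry. Renaming the parameter to inputUrl keeps the input distinct from those rebindings, and the loop now logs match itself. A minimal, self-contained sketch of that shadowing failure mode (zap_buggy, zap_fixed, and the example URLs are invented for illustration, not Photon's code):

def zap_buggy(url):
    # Earlier code rebinds `url`, like the loops in photon.py's zap()
    for url in ['https://example.com/a', 'https://example.com/b']:
        pass
    # `url` now holds the last loop value, not the argument
    for match in ['https://example.com/page1', 'https://example.com/page2']:
        print('Internal page', url)   # bug: stale `url` printed for every match

def zap_fixed(input_url):
    # A distinct parameter name cannot be clobbered by `for url in ...` loops,
    # and the loop reports `match` itself
    for match in ['https://example.com/page1', 'https://example.com/page2']:
        print('Internal page', match)

zap_buggy('https://example.com')  # prints .../b twice (wrong)
zap_fixed('https://example.com')  # prints page1, then page2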
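For reference, xmlParser() (the first context line of the diff) extracts sitemap URLs with a plain regex rather than an XML parser. A standalone sketch of that extraction, assuming photon.py's "from re import findall" style import; the sample sitemap string is invented:

from re import findall

def xml_parser(response):
    # Same pattern as photon.py's xmlParser(): capture text between <loc> tags
    return findall(r'<loc>(.*?)</loc>', response)

sample_sitemap = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>'''

print(xml_parser(sample_sitemap))
# ['https://example.com/', 'https://example.com/about']

The "'<body' not in response" checks in the diff are a soft-404 heuristic: a real robots.txt or sitemap.xml is plain text or XML and should not contain an HTML body tag.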
