Skip to content

1236. Web Crawler ๐Ÿ”’

Description

Given a url startUrl and an interface HtmlParser, implement a webย crawler to crawl all links that are under theย same hostname asย startUrl.ย 

Returnย all urls obtained by your web crawler in any order.

Your crawler should:

  • Start from the page: startUrl
  • Call HtmlParser.getUrls(url) to get all urls from a webpage of given url.
  • Do not crawl the same link twice.
  • Explore only the links that are under the same hostname as startUrl.

As shown in the example url above, the hostname is example.org. For simplicity sake, you may assume allย urls use http protocol without anyย port specified. For example, the urlsย http://leetcode.com/problems andย http://leetcode.com/contest are under the same hostname, while urls http://example.org/test and http://example.com/abc are not under the same hostname.

The HtmlParser interface is defined as such:ย 

interface HtmlParser {
  // Return a list of all urls from a webpage of given url.
  public List<String> getUrls(String url);
}

Belowย are two examples explaining the functionality of the problem, for custom testing purposes you'll have threeย variablesย urls,ย edgesย andย startUrl. Notice that you will only have access toย startUrlย in your code, whileย urlsย andย edgesย are not directly accessible to you in code.

Note: Consider the same URL with the trailing slash "/" as a different URL. For example, "http://news.yahoo.com", and "http://news.yahoo.com/" are different urls.

ย 

Example 1:

Input:
urls = [
ย  "http://news.yahoo.com",
ย  "http://news.yahoo.com/news",
ย  "http://news.yahoo.com/news/topics/",
ย  "http://news.google.com",
ย  "http://news.yahoo.com/us"
]
edges = [[2,0],[2,1],[3,2],[3,1],[0,4]]
startUrl = "http://news.yahoo.com/news/topics/"
Output: [
ย  "http://news.yahoo.com",
ย  "http://news.yahoo.com/news",
ย  "http://news.yahoo.com/news/topics/",
ย  "http://news.yahoo.com/us"
]

Example 2:

Input: 
urls = [
ย  "http://news.yahoo.com",
ย  "http://news.yahoo.com/news",
ย  "http://news.yahoo.com/news/topics/",
ย  "http://news.google.com"
]
edges = [[0,2],[2,1],[3,2],[3,1],[3,0]]
startUrl = "http://news.google.com"
Output: ["http://news.google.com"]
Explanation: The startUrl links to all other pages that do not share the same hostname.

ย 

Constraints:

  • 1 <= urls.length <= 1000
  • 1 <= urls[i].length <= 300
  • startUrlย is one of the urls.
  • Hostname label must be from 1 to 63 characters long, including the dots, may contain only the ASCII letters from 'a' toย 'z', digitsย  from '0' to '9' and theย hyphen-minusย character ('-').
  • The hostname may not start or end withย the hyphen-minus character ('-').ย 
  • See:ย ย https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
  • You may assume there'reย no duplicates in url library.

Solutions

Solution 1

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# """
# This is HtmlParser's API interface.
# You should not implement it, or speculate about its implementation
# """
# class HtmlParser(object):
#    def getUrls(self, url):
#        """
#        :type url: str
#        :rtype List[str]
#        """


class Solution:
    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        def host(url):
            url = url[7:]
            return url.split('/')[0]

        def dfs(url):
            if url in ans:
                return
            ans.add(url)
            for next in htmlParser.getUrls(url):
                if host(url) == host(next):
                    dfs(next)

        ans = set()
        dfs(startUrl)
        return list(ans)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *     public List<String> getUrls(String url) {}
 * }
 */

class Solution {
    private Set<String> ans;

    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        ans = new HashSet<>();
        dfs(startUrl, htmlParser);
        return new ArrayList<>(ans);
    }

    private void dfs(String url, HtmlParser htmlParser) {
        if (ans.contains(url)) {
            return;
        }
        ans.add(url);
        for (String next : htmlParser.getUrls(url)) {
            if (host(next).equals(host(url))) {
                dfs(next, htmlParser);
            }
        }
    }

    private String host(String url) {
        url = url.substring(7);
        return url.split("/")[0];
    }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * class HtmlParser {
 *   public:
 *     vector<string> getUrls(string url);
 * };
 */

class Solution {
public:
    vector<string> ans;
    unordered_set<string> vis;

    vector<string> crawl(string startUrl, HtmlParser htmlParser) {
        dfs(startUrl, htmlParser);
        return ans;
    }

    void dfs(string& url, HtmlParser& htmlParser) {
        if (vis.count(url)) return;
        vis.insert(url);
        ans.push_back(url);
        for (string next : htmlParser.getUrls(url))
            if (host(url) == host(next))
                dfs(next, htmlParser);
    }

    string host(string url) {
        int i = 7;
        string res;
        for (; i < url.size(); ++i) {
            if (url[i] == '/') break;
            res += url[i];
        }
        return res;
    }
};
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/**
 * // This is HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * type HtmlParser struct {
 *     func GetUrls(url string) []string {}
 * }
 */

func crawl(startUrl string, htmlParser HtmlParser) []string {
    var ans []string
    vis := make(map[string]bool)
    var dfs func(url string)
    host := func(url string) string {
        return strings.Split(url[7:], "/")[0]
    }
    dfs = func(url string) {
        if vis[url] {
            return
        }
        vis[url] = true
        ans = append(ans, url)
        for _, next := range htmlParser.GetUrls(url) {
            if host(next) == host(url) {
                dfs(next)
            }
        }
    }
    dfs(startUrl)
    return ans
}

Comments